From fb73de7582de4e622299a4ad045e25f771568193 Mon Sep 17 00:00:00 2001
From: Haibin Lin
Date: Wed, 10 Jun 2020 19:54:25 -0700
Subject: [PATCH] remove mx.module.* APIs for MXNet 2.0 (#18525)

* remove Module tests
* remove APIs relying on module
* remove docs and tools using mx.module
* remove executor manager
* remove ssd and ncf examples
* add back grad compression api doc
* fix lint
* add back cpredict example
* fix resnet memory test
* remove tests
* remove tests/python/tensorrt/test_tensorrt_lenet5.py since it depends on a model trained by mx.Module
* skip flaky test
* fix quantization test
* remove subgraph tests

Co-authored-by: EC2 Default User
Co-authored-by: Lin
---
 benchmark/python/gluon/benchmark_gluon.py | 164 --- benchmark/python/sparse/sparse_end2end.py | 307 ----- ci/docker/runtime_functions.sh | 13 - ci/jenkins/Jenkins_steps.groovy | 14 - ci/jenkins/Jenkinsfile_unix_gpu | 1 - .../deploy/inference/wine_detector.md | 405 ------- .../gluon_from_experiment_to_deployment.md | 48 +- .../packages/ndarray/sparse/train.md | 339 ------ .../packages/onnx/super_resolution.md | 140 --- .../tutorials/performance/backend/amp.md | 65 +- .../backend/mkldnn/mkldnn_quantization.md | 258 ---- .../src/pages/api/faq/gradient_compression.md | 8 - .../src/pages/api/faq/multi_devices.md | 217 ---- example/image-classification/README.md | 387 ------ example/image-classification/__init__.py | 0 example/image-classification/benchmark.py | 262 ---- .../image-classification/benchmark_score.py | 134 --- .../image-classification/common/__init__.py | 0 example/image-classification/common/data.py | 206 ---- .../image-classification/common/find_mxnet.py | 24 - example/image-classification/common/fit.py | 340 ------ .../image-classification/common/modelzoo.py | 63 - example/image-classification/common/util.py | 54 - .../image-classification/data/caltech256.sh | 61 - .../data/imagenet1k-val.sh | 40 - example/image-classification/fine-tune.py | 102 -- example/image-classification/score.py | 107 -- example/image-classification/symbol_alexnet.R | 53 - .../image-classification/symbol_googlenet.R | 84 -- .../symbol_inception-bn-28-small.R | 89 -- .../symbol_inception-bn.R | 134 --- .../symbol_inception-resnet-v1.R | 410 ------- .../symbol_inception-resnet-v2.R | 455 ------- example/image-classification/symbol_lenet.R | 41 - example/image-classification/symbol_mlp.R | 29 - .../symbol_resnet-28-small.R | 99 -- .../image-classification/symbol_resnet-v2.R | 162 --- example/image-classification/symbol_resnet.R | 87 -- example/image-classification/symbol_vgg.R | 75 -- .../image-classification/symbols/README.md | 13 - .../image-classification/symbols/__init__.py | 0 .../image-classification/symbols/alexnet.py | 68 -- .../image-classification/symbols/googlenet.py | 72 -- .../symbols/inception-bn.py | 144 --- .../symbols/inception-resnet-v2.py | 158 --- .../symbols/inception-v3.py | 193 --- .../symbols/inception-v4.py | 215 ---- example/image-classification/symbols/lenet.py | 64 - example/image-classification/symbols/mlp.py | 32 - .../image-classification/symbols/mobilenet.py | 144 --- .../symbols/mobilenetv2.py | 218 ---- .../image-classification/symbols/resnet-v1.py | 200 --- .../image-classification/symbols/resnetv1.py | 200 --- .../image-classification/symbols/resnext.py | 210 ---- example/image-classification/symbols/vgg.py | 76 -- example/image-classification/test_score.py | 64 - example/image-classification/train_cifar10.R | 145 --- example/image-classification/train_cifar10.py | 76 --
example/image-classification/train_imagenet.R | 140 --- .../image-classification/train_imagenet.py | 66 - example/image-classification/train_mnist.R | 163 --- example/image-classification/train_mnist.py | 97 -- example/image-classification/train_model.R | 107 -- .../neural_collaborative_filtering/README.md | 113 -- .../benchmark.sh | 118 -- example/neural_collaborative_filtering/ci.py | 60 - .../neural_collaborative_filtering/convert.py | 127 -- .../core/dataset.py | 99 -- .../core/evaluate.py | 105 -- .../core/load.py | 74 -- .../core/model.py | 135 --- .../model_optimizer.py | 81 -- example/neural_collaborative_filtering/ncf.py | 162 --- .../neural_collaborative_filtering/train.py | 163 --- example/ssd/README.md | 270 ----- example/ssd/__init__.py | 0 example/ssd/benchmark_score.py | 117 -- example/ssd/config/__init__.py | 0 example/ssd/config/config.py | 85 -- example/ssd/config/utils.py | 108 -- example/ssd/data/demo/download_demo_images.py | 38 - example/ssd/dataset/__init__.py | 0 example/ssd/dataset/concat_db.py | 127 -- example/ssd/dataset/cv2Iterator.py | 65 - example/ssd/dataset/imdb.py | 127 -- example/ssd/dataset/iterator.py | 307 ----- example/ssd/dataset/mscoco.py | 138 --- example/ssd/dataset/names/mscoco.names | 80 -- example/ssd/dataset/names/pascal_voc.names | 20 - example/ssd/dataset/pascal_voc.py | 286 ----- example/ssd/dataset/pycocotools/README.md | 19 - example/ssd/dataset/pycocotools/__init__.py | 18 - example/ssd/dataset/pycocotools/coco.py | 418 ------- example/ssd/dataset/testdb.py | 69 -- example/ssd/dataset/yolo_format.py | 170 --- example/ssd/demo.py | 241 ---- example/ssd/deploy.py | 63 - example/ssd/detect/__init__.py | 0 example/ssd/detect/detector.py | 238 ---- example/ssd/evaluate.py | 108 -- example/ssd/evaluate/__init__.py | 0 example/ssd/evaluate/eval_metric.py | 295 ----- example/ssd/evaluate/eval_voc.py | 196 --- example/ssd/evaluate/evaluate_net.py | 133 -- example/ssd/init.sh | 59 - example/ssd/model/README.md | 18 - example/ssd/quantization.py | 159 --- example/ssd/symbol/README.md | 66 - example/ssd/symbol/__init__.py | 0 example/ssd/symbol/common.py | 304 ----- example/ssd/symbol/inceptionv3.py | 185 --- example/ssd/symbol/legacy_vgg16_ssd_300.py | 207 ---- example/ssd/symbol/legacy_vgg16_ssd_512.py | 210 ---- example/ssd/symbol/resnet.py | 186 --- example/ssd/symbol/symbol_builder.py | 182 --- example/ssd/symbol/symbol_factory.py | 139 --- example/ssd/symbol/vgg16_reduced.py | 103 -- example/ssd/tools/__init__.py | 0 example/ssd/tools/find_mxnet.py | 24 - example/ssd/tools/prepare_coco.sh | 22 - example/ssd/tools/prepare_dataset.py | 141 --- example/ssd/tools/prepare_pascal.sh | 22 - example/ssd/tools/rand_sampler.py | 287 ----- example/ssd/tools/visualize_net.py | 44 - example/ssd/train.py | 156 --- example/ssd/train/__init__.py | 0 example/ssd/train/metric.py | 83 -- example/ssd/train/train_net.py | 279 ----- python/mxnet/__init__.py | 3 - python/mxnet/callback.py | 28 - python/mxnet/contrib/amp/amp.py | 64 - .../contrib/onnx/onnx2mx/_import_helper.py | 6 +- .../contrib/onnx/onnx2mx/_op_translations.py | 45 - .../onnx/onnx2mx/_translation_utils.py | 54 - python/mxnet/contrib/quantization.py | 391 ------ .../contrib/svrg_optimization/__init__.py | 22 - .../contrib/svrg_optimization/svrg_module.py | 579 --------- .../svrg_optimization/svrg_optimizer.py | 174 --- python/mxnet/executor.py | 5 - python/mxnet/executor_manager.py | 443 ------- python/mxnet/gluon/block.py | 6 +- python/mxnet/initializer.py | 26 +- python/mxnet/module/__init__.py | 27 
- python/mxnet/module/base_module.py | 1067 ----------------- python/mxnet/module/bucketing_module.py | 702 ----------- python/mxnet/module/executor_group.py | 703 ----------- python/mxnet/module/module.py | 870 -------------- python/mxnet/module/python_module.py | 362 ------ python/mxnet/module/sequential_module.py | 440 ------- .../common.py | 12 - .../model_backwards_compat_inference.py | 33 - .../model_backwards_compat_train.py | 25 - tests/nightly/test_optimizer.py | 38 - tests/nightly/test_tlocal_racecondition.py | 110 -- tests/python/gpu/test_contrib_amp.py | 85 -- tests/python/gpu/test_operator_gpu.py | 24 - tests/python/gpu/test_predictor.py | 60 - tests/python/mkl/test_contrib_amp.py | 152 --- tests/python/mkl/test_mkldnn.py | 18 - tests/python/mkl/test_subgraph.py | 485 -------- .../python/quantization/test_quantization.py | 222 ---- tests/python/tensorrt/lenet5_train.py | 101 -- tests/python/tensorrt/test_tensorrt_lenet5.py | 119 -- tests/python/train/test_dtype.py | 263 ---- tests/python/train/test_resnet_aug.py | 151 --- tests/python/train/test_sparse_fm.py | 144 --- tests/python/unittest/onnx/test_models.py | 164 --- tests/python/unittest/onnx/test_node.py | 92 -- .../python/unittest}/resnet.py | 0 .../unittest/test_contrib_svrg_module.py | 307 ----- .../unittest/test_contrib_svrg_optimizer.py | 97 -- tests/python/unittest/test_gluon.py | 7 - tests/python/unittest/test_gluon_rnn.py | 34 - tests/python/unittest/test_init.py | 97 -- tests/python/unittest/test_loss.py | 283 ----- tests/python/unittest/test_memory_opt.py | 13 +- tests/python/unittest/test_module.py | 740 ------------ tests/python/unittest/test_operator.py | 40 - tests/python/unittest/test_optimizer.py | 28 +- 179 files changed, 26 insertions(+), 26126 deletions(-) delete mode 100644 benchmark/python/gluon/benchmark_gluon.py delete mode 100644 benchmark/python/sparse/sparse_end2end.py delete mode 100644 docs/python_docs/python/tutorials/deploy/inference/wine_detector.md delete mode 100644 docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md delete mode 100644 docs/python_docs/python/tutorials/packages/onnx/super_resolution.md delete mode 100644 docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_quantization.md delete mode 100644 docs/static_site/src/pages/api/faq/multi_devices.md delete mode 100644 example/image-classification/README.md delete mode 100644 example/image-classification/__init__.py delete mode 100644 example/image-classification/benchmark.py delete mode 100644 example/image-classification/benchmark_score.py delete mode 100644 example/image-classification/common/__init__.py delete mode 100644 example/image-classification/common/data.py delete mode 100644 example/image-classification/common/find_mxnet.py delete mode 100644 example/image-classification/common/fit.py delete mode 100644 example/image-classification/common/modelzoo.py delete mode 100644 example/image-classification/common/util.py delete mode 100755 example/image-classification/data/caltech256.sh delete mode 100755 example/image-classification/data/imagenet1k-val.sh delete mode 100644 example/image-classification/fine-tune.py delete mode 100644 example/image-classification/score.py delete mode 100644 example/image-classification/symbol_alexnet.R delete mode 100644 example/image-classification/symbol_googlenet.R delete mode 100644 example/image-classification/symbol_inception-bn-28-small.R delete mode 100644 example/image-classification/symbol_inception-bn.R delete mode 100644 
example/image-classification/symbol_inception-resnet-v1.R delete mode 100644 example/image-classification/symbol_inception-resnet-v2.R delete mode 100644 example/image-classification/symbol_lenet.R delete mode 100644 example/image-classification/symbol_mlp.R delete mode 100644 example/image-classification/symbol_resnet-28-small.R delete mode 100644 example/image-classification/symbol_resnet-v2.R delete mode 100644 example/image-classification/symbol_resnet.R delete mode 100644 example/image-classification/symbol_vgg.R delete mode 100644 example/image-classification/symbols/README.md delete mode 100644 example/image-classification/symbols/__init__.py delete mode 100644 example/image-classification/symbols/alexnet.py delete mode 100644 example/image-classification/symbols/googlenet.py delete mode 100644 example/image-classification/symbols/inception-bn.py delete mode 100644 example/image-classification/symbols/inception-resnet-v2.py delete mode 100644 example/image-classification/symbols/inception-v3.py delete mode 100644 example/image-classification/symbols/inception-v4.py delete mode 100644 example/image-classification/symbols/lenet.py delete mode 100644 example/image-classification/symbols/mlp.py delete mode 100644 example/image-classification/symbols/mobilenet.py delete mode 100644 example/image-classification/symbols/mobilenetv2.py delete mode 100644 example/image-classification/symbols/resnet-v1.py delete mode 100644 example/image-classification/symbols/resnetv1.py delete mode 100644 example/image-classification/symbols/resnext.py delete mode 100644 example/image-classification/symbols/vgg.py delete mode 100644 example/image-classification/test_score.py delete mode 100644 example/image-classification/train_cifar10.R delete mode 100644 example/image-classification/train_cifar10.py delete mode 100644 example/image-classification/train_imagenet.R delete mode 100644 example/image-classification/train_imagenet.py delete mode 100644 example/image-classification/train_mnist.R delete mode 100644 example/image-classification/train_mnist.py delete mode 100644 example/image-classification/train_model.R delete mode 100644 example/neural_collaborative_filtering/README.md delete mode 100755 example/neural_collaborative_filtering/benchmark.sh delete mode 100644 example/neural_collaborative_filtering/ci.py delete mode 100644 example/neural_collaborative_filtering/convert.py delete mode 100644 example/neural_collaborative_filtering/core/dataset.py delete mode 100644 example/neural_collaborative_filtering/core/evaluate.py delete mode 100644 example/neural_collaborative_filtering/core/load.py delete mode 100644 example/neural_collaborative_filtering/core/model.py delete mode 100644 example/neural_collaborative_filtering/model_optimizer.py delete mode 100644 example/neural_collaborative_filtering/ncf.py delete mode 100644 example/neural_collaborative_filtering/train.py delete mode 100644 example/ssd/README.md delete mode 100644 example/ssd/__init__.py delete mode 100644 example/ssd/benchmark_score.py delete mode 100644 example/ssd/config/__init__.py delete mode 100644 example/ssd/config/config.py delete mode 100644 example/ssd/config/utils.py delete mode 100644 example/ssd/data/demo/download_demo_images.py delete mode 100644 example/ssd/dataset/__init__.py delete mode 100644 example/ssd/dataset/concat_db.py delete mode 100644 example/ssd/dataset/cv2Iterator.py delete mode 100644 example/ssd/dataset/imdb.py delete mode 100644 example/ssd/dataset/iterator.py delete mode 100644 example/ssd/dataset/mscoco.py 
delete mode 100644 example/ssd/dataset/names/mscoco.names delete mode 100644 example/ssd/dataset/names/pascal_voc.names delete mode 100644 example/ssd/dataset/pascal_voc.py delete mode 100644 example/ssd/dataset/pycocotools/README.md delete mode 100644 example/ssd/dataset/pycocotools/__init__.py delete mode 100644 example/ssd/dataset/pycocotools/coco.py delete mode 100644 example/ssd/dataset/testdb.py delete mode 100644 example/ssd/dataset/yolo_format.py delete mode 100644 example/ssd/demo.py delete mode 100644 example/ssd/deploy.py delete mode 100644 example/ssd/detect/__init__.py delete mode 100644 example/ssd/detect/detector.py delete mode 100644 example/ssd/evaluate.py delete mode 100644 example/ssd/evaluate/__init__.py delete mode 100644 example/ssd/evaluate/eval_metric.py delete mode 100644 example/ssd/evaluate/eval_voc.py delete mode 100644 example/ssd/evaluate/evaluate_net.py delete mode 100755 example/ssd/init.sh delete mode 100644 example/ssd/model/README.md delete mode 100644 example/ssd/quantization.py delete mode 100644 example/ssd/symbol/README.md delete mode 100644 example/ssd/symbol/__init__.py delete mode 100644 example/ssd/symbol/common.py delete mode 100644 example/ssd/symbol/inceptionv3.py delete mode 100644 example/ssd/symbol/legacy_vgg16_ssd_300.py delete mode 100644 example/ssd/symbol/legacy_vgg16_ssd_512.py delete mode 100644 example/ssd/symbol/resnet.py delete mode 100644 example/ssd/symbol/symbol_builder.py delete mode 100644 example/ssd/symbol/symbol_factory.py delete mode 100644 example/ssd/symbol/vgg16_reduced.py delete mode 100644 example/ssd/tools/__init__.py delete mode 100644 example/ssd/tools/find_mxnet.py delete mode 100644 example/ssd/tools/prepare_coco.sh delete mode 100644 example/ssd/tools/prepare_dataset.py delete mode 100755 example/ssd/tools/prepare_pascal.sh delete mode 100644 example/ssd/tools/rand_sampler.py delete mode 100644 example/ssd/tools/visualize_net.py delete mode 100644 example/ssd/train.py delete mode 100644 example/ssd/train/__init__.py delete mode 100644 example/ssd/train/metric.py delete mode 100644 example/ssd/train/train_net.py delete mode 100644 python/mxnet/contrib/svrg_optimization/__init__.py delete mode 100644 python/mxnet/contrib/svrg_optimization/svrg_module.py delete mode 100644 python/mxnet/contrib/svrg_optimization/svrg_optimizer.py delete mode 100644 python/mxnet/executor_manager.py delete mode 100644 python/mxnet/module/__init__.py delete mode 100644 python/mxnet/module/base_module.py delete mode 100644 python/mxnet/module/bucketing_module.py delete mode 100644 python/mxnet/module/executor_group.py delete mode 100644 python/mxnet/module/module.py delete mode 100644 python/mxnet/module/python_module.py delete mode 100644 python/mxnet/module/sequential_module.py delete mode 100644 tests/nightly/test_tlocal_racecondition.py delete mode 100644 tests/python/tensorrt/lenet5_train.py delete mode 100644 tests/python/tensorrt/test_tensorrt_lenet5.py delete mode 100644 tests/python/train/test_dtype.py delete mode 100644 tests/python/train/test_resnet_aug.py delete mode 100644 tests/python/train/test_sparse_fm.py delete mode 100644 tests/python/unittest/onnx/test_models.py rename {example/image-classification/symbols => tests/python/unittest}/resnet.py (100%) delete mode 100644 tests/python/unittest/test_contrib_svrg_module.py delete mode 100644 tests/python/unittest/test_contrib_svrg_optimizer.py delete mode 100644 tests/python/unittest/test_init.py delete mode 100644 tests/python/unittest/test_module.py diff --git 
a/benchmark/python/gluon/benchmark_gluon.py b/benchmark/python/gluon/benchmark_gluon.py deleted file mode 100644 index 3dbb36404d07..000000000000 --- a/benchmark/python/gluon/benchmark_gluon.py +++ /dev/null @@ -1,164 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -import mxnet.gluon.model_zoo.vision as models -import time -import logging -import argparse -import subprocess -import os -import errno - -logging.basicConfig(level=logging.INFO) -parser = argparse.ArgumentParser(description='Gluon modelzoo-based CNN performance benchmark') - -parser.add_argument('--model', type=str, default='all', - choices=['all', 'alexnet', 'densenet121', 'densenet161', - 'densenet169', 'densenet201', 'inceptionv3', 'mobilenet0.25', - 'mobilenet0.5', 'mobilenet0.75', 'mobilenet1.0', 'mobilenetv2_0.25', - 'mobilenetv2_0.5', 'mobilenetv2_0.75', 'mobilenetv2_1.0', 'resnet101_v1', - 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', 'resnet18_v1', - 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', - 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', - 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', - 'vgg19', 'vgg19_bn']) -parser.add_argument('--batch-size', type=int, default=0, - help='Batch size to use for benchmarking. Example: 32, 64, 128.' - 'By default, runs benchmark for batch sizes - 1, 32, 64, 128, 256') -parser.add_argument('--num-batches', type=int, default=10) -parser.add_argument('--gpus', type=str, default='', - help='GPU IDs to use for this benchmark task. Example: --gpus=0,1,2,3 to use 4 GPUs.' - 'By default, use CPU only.') -parser.add_argument('--type', type=str, default='inference', choices=['all', 'training', 'inference']) - -opt = parser.parse_args() - -num_batches = opt.num_batches -dry_run = 10 # use 10 iterations to warm up -batch_inf = [1, 32, 64, 128, 256] -batch_train = [1, 32, 64, 128, 256] -image_shapes = [(3, 224, 224), (3, 299, 299)] - -def score(network, batch_size, ctx): - assert (batch_size >= len(ctx)), "ERROR: batch size should not be smaller than num of GPUs." 
- net = models.get_model(network) - if 'inceptionv3' == network: - data_shape = [('data', (batch_size,) + image_shapes[1])] - else: - data_shape = [('data', (batch_size,) + image_shapes[0])] - - data = mx.sym.var('data') - out = net(data) - softmax = mx.sym.SoftmaxOutput(out, name='softmax') - mod = mx.mod.Module(softmax, context=ctx) - mod.bind(for_training = False, - inputs_need_grad = False, - data_shapes = data_shape) - mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx[0]) for _, shape in mod.data_shapes] - batch = mx.io.DataBatch(data, []) - for i in range(dry_run + num_batches): - if i == dry_run: - tic = time.time() - mod.forward(batch, is_train=False) - for output in mod.get_outputs(): - output.wait_to_read() - fwd = time.time() - tic - return fwd - - -def train(network, batch_size, ctx): - assert (batch_size >= len(ctx)), "ERROR: batch size should not be smaller than num of GPUs." - net = models.get_model(network) - if 'inceptionv3' == network: - data_shape = [('data', (batch_size,) + image_shapes[1])] - else: - data_shape = [('data', (batch_size,) + image_shapes[0])] - - data = mx.sym.var('data') - out = net(data) - softmax = mx.sym.SoftmaxOutput(out, name='softmax') - mod = mx.mod.Module(softmax, context=ctx) - mod.bind(for_training = True, - inputs_need_grad = False, - data_shapes = data_shape) - mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - if len(ctx) > 1: - mod.init_optimizer(kvstore='device', optimizer='sgd') - else: - mod.init_optimizer(kvstore='local', optimizer='sgd') - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx[0]) for _, shape in mod.data_shapes] - batch = mx.io.DataBatch(data, []) - for i in range(dry_run + num_batches): - if i == dry_run: - tic = time.time() - mod.forward(batch, is_train=True) - for output in mod.get_outputs(): - output.wait_to_read() - mod.backward() - mod.update() - bwd = time.time() - tic - return bwd - -if __name__ == '__main__': - runtype = opt.type - bs = opt.batch_size - - if opt.model == 'all': - networks = ['alexnet', 'densenet121', 'densenet161', 'densenet169', 'densenet201', - 'inceptionv3', 'mobilenet0.25', 'mobilenet0.5', 'mobilenet0.75', - 'mobilenet1.0', 'mobilenetv2_0.25', 'mobilenetv2_0.5', 'mobilenetv2_0.75', - 'mobilenetv2_1.0', 'resnet101_v1', 'resnet101_v2', 'resnet152_v1', 'resnet152_v2', - 'resnet18_v1', 'resnet18_v2', 'resnet34_v1', 'resnet34_v2', 'resnet50_v1', - 'resnet50_v2', 'squeezenet1.0', 'squeezenet1.1', 'vgg11', 'vgg11_bn', 'vgg13', - 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn'] - logging.info('It may take some time to run all models, ' - 'set --network to run a specific one') - else: - networks = [opt.model] - - devs = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()] - num_gpus = len(devs) - - for network in networks: - logging.info('network: %s', network) - logging.info('device: %s', devs) - if runtype == 'inference' or runtype == 'all': - if bs != 0: - fwd_time = score(network, bs, devs) - fps = (bs * num_batches)/fwd_time - logging.info(network + ' inference perf for BS %d is %f img/s', bs, fps) - else: - logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, ' - 'set --batch-size to run a specific one') - for batch_size in batch_inf: - fwd_time = score(network, batch_size, devs) - fps = (batch_size * num_batches) / fwd_time - logging.info(network + ' inference perf for BS %d is %f img/s', batch_size, fps) - if runtype == 'training' or runtype == 'all': - if bs != 0: - 
bwd_time = train(network, bs, devs) - fps = (bs * num_batches) / bwd_time - logging.info(network + ' training perf for BS %d is %f img/s', bs, fps) - else: - logging.info('run batchsize [1, 2, 4, 8, 16, 32] by default, ' - 'set --batch-size to run a specific one') - for batch_size in batch_train: - bwd_time = train(network, batch_size, devs) - fps = (batch_size * num_batches) / bwd_time - logging.info(network + ' training perf for BS %d is %f img/s', batch_size, fps) diff --git a/benchmark/python/sparse/sparse_end2end.py b/benchmark/python/sparse/sparse_end2end.py deleted file mode 100644 index fc949b649767..000000000000 --- a/benchmark/python/sparse/sparse_end2end.py +++ /dev/null @@ -1,307 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import time -import argparse -import os -import multiprocessing -from mxnet.test_utils import * - -MAX_NUM_BATCH = 99999999 -COMP = "compute" -COMM = "communication" -IO = "io" - -parser = argparse.ArgumentParser(description="Run sparse linear regression " \ - "with distributed kvstore", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--profiler', type=int, default=0, - help='whether to use profiler') -parser.add_argument('--num-epoch', type=int, default=1, - help='number of epochs to train') -parser.add_argument('--batch-size', type=int, default=512, - help='number of examples per batch') -parser.add_argument('--num-batch', type=int, default=MAX_NUM_BATCH, - help='number of batches per epoch') -parser.add_argument('--dummy-iter', type=int, default=0, - help='whether to use dummy iterator to exclude io cost') -parser.add_argument('--kvstore', type=str, default=None, - help='what kvstore to use [local, dist_sync, etc]') -parser.add_argument('--sparse-log-level', type=str, default='DEBUG', - help='logging level [DEBUG, INFO, ERROR]') -parser.add_argument('--dataset', type=str, default='avazu', - help='what test dataset to use') -parser.add_argument('--num-gpu', type=int, default=0, - help='number of gpus to use. 
0 means using cpu(0);' - 'otherwise, use gpu(0),...,gpu(num_gpu-1)') -parser.add_argument('--output-dim', type=int, default=4, - help='number of columns of the forward output') -parser.add_argument('--dummy-metric', type=int, default=0, - help='whether to call update_metric') -parser.add_argument('--enable-logging-for', default="0", - help="Enable logging for the specified list of workers") -parser.add_argument('--measure-only', default=None, - help="Measure only", - choices=[IO, COMP, COMM]) -parser.add_argument('--omit-row-sparse-push', action='store_true', - help="omit row_sparse_push") - -class DummyIter(mx.io.DataIter): - "A dummy iterator that always return the same batch, used for speed testing" - def __init__(self, real_iter): - super(DummyIter, self).__init__() - self.real_iter = real_iter - self.provide_data = real_iter.provide_data - self.provide_label = real_iter.provide_label - self.batch_size = real_iter.batch_size - - for batch in real_iter: - self.the_batch = batch - break - - def __iter__(self): - return self - - def next(self): - return self.the_batch - -# testing dataset sources -avazu = { - 'data_name': 'avazu-app.t', - 'data_origin_name': 'avazu-app.t.bz2', - 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", - 'feature_dim': 1000001, - 'lc': 1719304, -} - -kdda = { - 'data_name': 'kdda.t', - 'data_origin_name': 'kdda.t.bz2', - 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", - 'feature_dim': 20216831, - 'lc': 510302, -} - -criteo = { - 'data_name': 'criteo.t', - 'data_origin_name': 'criteo.t.bz2', - 'url': "https://s3-us-west-2.amazonaws.com/sparse-dataset/criteo.t.bz2", - 'feature_dim': 8388621, - 'lc': 548787, -} - -datasets = { 'kdda' : kdda, 'avazu' : avazu , 'criteo': criteo } - - -def get_sym(feature_dim): - inputs = mx.symbol.Variable("data", stype='csr') - norm_init = mx.initializer.Normal(sigma=0.01) - weights = mx.symbol.Variable("w", shape=(feature_dim, args.output_dim), - init=norm_init, stype='row_sparse') - embed = mx.symbol.sparse.dot(inputs, weights) - softmax_output = mx.symbol.Variable("softmax_label") - model = mx.symbol.SoftmaxOutput(data=embed, label=softmax_output, name="out") - return model - - -def row_sparse_push(kv, param_arrays, grad_arrays, param_names): - for index, pair in enumerate(zip(param_arrays, grad_arrays)): - arg_list, grad_list = pair - if grad_list[0] is None: - continue - name = param_names[index] - kv.push(name, grad_list, priority=-index) - - -def row_sparse_pull(kv, key, data, slices, weight_array, priority): - # if have kvstore, need to pull corresponding rows of - # the weights to each context - # column indices (NDArray type) of the csr data - # used as the row_idx of the weight row-sparse matrix - row_indices = data.indices - if len(slices) == 1: - kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_indices) - else: # more than one slices, multi-GPU training. 
Need to retain weight rows according to data slices - # TODO(junwu): - # the following line blocks, may need to pre-compute - # and cache it outside the for loop - indptr = data.indptr.asnumpy() - row_idx_array = [] - for s in slices: - row_idx_array.append(row_indices[indptr[s.start]:indptr[s.stop]]) - kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_idx_array) - - -if __name__ == '__main__': - - # arg parser - args = parser.parse_args() - num_epoch = args.num_epoch - num_batch = args.num_batch - kvstore = args.kvstore - profiler = args.profiler > 0 - batch_size = args.batch_size if args.num_gpu == 0 else args.num_gpu * args.batch_size - dummy_iter = args.dummy_iter - dataset = args.dataset - log_level = args.sparse_log_level - measure_only = args.measure_only - num_cores = multiprocessing.cpu_count() - omit_row_sparse_push = args.omit_row_sparse_push - if measure_only == COMP or measure_only == IO: - assert not kvstore, "when compute_only or io_only is set, kvstore should be None" - num_batch = datasets[dataset]['lc'] / batch_size if num_batch == MAX_NUM_BATCH else num_batch - if measure_only == COMM: - assert (kvstore == "dist_async"), "when communication_only is set kvstore should be dist_async" - num_batch = datasets[dataset]['lc'] / batch_size if num_batch == MAX_NUM_BATCH else num_batch - - - contexts = mx.context.cpu(0) if args.num_gpu < 1\ - else [mx.context.gpu(i) for i in range(args.num_gpu)] - - # create kvstore when there are gpus - kv = mx.kvstore.create(kvstore) if kvstore else None - rank = kv.rank if kv is not None else 0 - num_worker = kv.num_workers if kv is not None else 1 - - # only print log for rank 0 worker - import logging - if log_level == 'ERROR': - log_level = logging.ERROR - elif log_level == 'DEBUG': - log_level = logging.DEBUG - else: - log_level = logging.INFO - - # Only log if it is in the list of workers to be logged - logging_workers_list = [int(i) for i in args.enable_logging_for.split(",")] - log_level = log_level if rank in logging_workers_list else logging.CRITICAL - - head = '%(asctime)-15s %(message)s' - logging.basicConfig(level=log_level, format=head) - - # dataset - assert(dataset in datasets), "unknown dataset " + dataset - metadata = datasets[dataset] - feature_dim = metadata['feature_dim'] - if logging: - logging.debug('preparing data ... 
') - data_dir = os.path.join(os.getcwd(), 'data') - path = os.path.join(data_dir, metadata['data_name']) - if not os.path.exists(path): - get_bz2_data(data_dir, metadata['data_name'], metadata['url'], - metadata['data_origin_name']) - assert os.path.exists(path) - - # data iterator - train_data = mx.io.LibSVMIter(data_libsvm=path, data_shape=(feature_dim,), - batch_size=batch_size, num_parts=num_worker, - part_index=rank) - if dummy_iter or measure_only == COMP or measure_only == COMM: - train_data = DummyIter(train_data) - - # model - model = get_sym(feature_dim) - - # module - mod = mx.mod.Module(symbol=model, data_names=['data'], - label_names=['softmax_label'], context=contexts) - mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label) - mod.init_params(initializer=mx.init.Uniform(scale=.1)) - sgd = mx.optimizer.SGD(momentum=0.0, clip_gradient=5.0, - learning_rate=0.1, rescale_grad=1.0/batch_size/num_worker) - mod.init_optimizer(optimizer=sgd, kvstore=kv) - # use accuracy as the metric - metric = mx.gluon.metric.create('acc') - - index = mod._exec_group.param_names.index('w') - # weight_array bound to executors of the contexts - weight_array = mod._exec_group.param_arrays[index] - - mx.nd.waitall() # sync point for initialization - # start profiler - if profiler: - device = 'cpu' - if args.num_gpu > 0: - device = 'gpu' + str(args.num_gpu) - name = 'profile_' + args.dataset + '_' + device + '_nworker' + str(num_worker)\ - + '_batchsize' + str(args.batch_size) + '_outdim' + str(args.output_dim) + '.json' - mx.profiler.set_config(profile_all=True, filename=name) - mx.profiler.set_state('run') - - logging.debug('start training ...') - start = time.time() - data_iter = iter(train_data) - time_cost_epoch = 0. - sum_cost_epoch = 0. - average_cost_epoch = 0. 
- - for epoch in range(num_epoch): - start_time_epoch = time.time() - nbatch = 0 - end_of_batch = False - metric.reset() - next_batch = next(data_iter) - if kv is not None: - row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index) - while not end_of_batch: - nbatch += 1 - batch = next_batch - - if measure_only != IO and measure_only != COMM: - mod.forward_backward(batch) - # update parameters - mod.update() - if measure_only == COMM: - if nbatch == 1: - mod.forward_backward(batch) - mod.update() - elif not omit_row_sparse_push: - row_sparse_push(kv, mod._exec_group.param_arrays, mod._exec_group.grad_arrays, mod._exec_group.param_names) - - - try: - # pre fetch next batch - next_batch = next(data_iter) - if nbatch == num_batch: - raise StopIteration - if kv is not None: - row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index) - except StopIteration: - end_of_batch = True - # accumulate prediction accuracy - if args.dummy_metric == 0: - mod.update_metric(metric, batch.label) - else: # call waitall to replace update_metric as sync point - mx.nd.waitall() # sync point for the current minibatch - logging.info('epoch {}, {}'.format(epoch, metric.get())) - end_time_epoch = time.time() - if epoch == 0: - logging.debug("num_batches = {}".format(nbatch)) - logging.info('|device|num_worker|average_cost_epoch|rank|') - time_cost_epoch = end_time_epoch - start_time_epoch - if epoch > 0: - sum_cost_epoch = sum_cost_epoch + time_cost_epoch - average_cost_epoch = float(sum_cost_epoch) / epoch - logging.info('num_worker = {}, time cost per epoch = {}'.format(str(num_worker), str(time_cost_epoch))) - if args.num_gpu < 1: - logging.info('|cpu/{} cores| {} | {} | {} |'.format(str(num_cores), str(num_worker), str(average_cost_epoch), rank)) - data_iter.reset() - if profiler: - mx.profiler.set_state('stop') - end = time.time() - time_cost = end - start - logging.info('num_worker = {}, rank = {}, time cost = {}'.format(str(num_worker), str(rank), str(time_cost))) diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index f3b03570db33..62412e092523 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -966,9 +966,6 @@ cd_unittest_ubuntu() { pytest -m 'not serial' -k 'not test_operator' -n 4 --durations=50 --verbose tests/python/gpu pytest -m 'serial' --durations=50 --verbose tests/python/gpu - # Adding these here as CI doesn't test all CUDA environments - MXNET_GPU_MEM_POOL_TYPE=Unpooled \ - pytest -n 4 example/image-classification/test_score.py # TODO(szha): fix and reenable the hanging issue. 
tracked in #18098 # integrationtest_ubuntu_gpu_dist_kvstore integrationtest_ubuntu_gpu_byteps @@ -1081,7 +1078,6 @@ unittest_ubuntu_tensorrt_gpu() { export CUDNN_VERSION=${CUDNN_VERSION:-7.0.3} export MXNET_ENABLE_CYTHON=0 export DMLC_LOG_STACK_TRACE_DEPTH=10 - python3 tests/python/tensorrt/lenet5_train.py MXNET_GPU_MEM_POOL_TYPE=Unpooled \ pytest -n 4 --durations=50 --cov-report xml:tests_trt_gpu.xml --verbose --capture=no tests/python/tensorrt/test_ops.py pytest -k 'not test_ops' --durations=50 --cov-report xml:tests_trt_gpu.xml --cov-append --verbose --capture=no tests/python/tensorrt/ @@ -1275,15 +1271,6 @@ integrationtest_ubuntu_cpu_onnx() { pytest -n 4 tests/python/unittest/onnx/test_node.py } -integrationtest_ubuntu_gpu_python() { - set -ex - export PYTHONPATH=./python/ - export MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 - export MXNET_SUBGRAPH_VERBOSE=0 - export DMLC_LOG_STACK_TRACE_DEPTH=10 - pytest example/image-classification/test_score.py -} - integrationtest_ubuntu_cpu_asan() { set -ex export DMLC_LOG_STACK_TRACE_DEPTH=10 diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy index 8ac63ef5a9d6..ce97f4a03a74 100644 --- a/ci/jenkins/Jenkins_steps.groovy +++ b/ci/jenkins/Jenkins_steps.groovy @@ -970,20 +970,6 @@ def test_unix_python3_tensorrt_gpu(lib_name) { }] } -def test_unix_python3_integration_gpu(lib_name) { - return ['Python Integration GPU': { - node(NODE_LINUX_GPU_G4) { - ws('workspace/it-python-gpu') { - timeout(time: max_time, unit: 'MINUTES') { - utils.unpack_and_init(lib_name, mx_lib) - utils.docker_run('ubuntu_gpu_cu101', 'integrationtest_ubuntu_gpu_python', true) - utils.publish_test_coverage() - } - } - } - }] -} - def test_unix_cpp_package_gpu(lib_name) { return ['cpp-package GPU Makefile': { node(NODE_LINUX_GPU_G4) { diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu index 84ac2bcb4623..33c0d4daf580 100644 --- a/ci/jenkins/Jenkinsfile_unix_gpu +++ b/ci/jenkins/Jenkinsfile_unix_gpu @@ -55,7 +55,6 @@ core_logic: { custom_steps.test_unix_perl_gpu('gpu_make'), custom_steps.test_unix_r_gpu('gpu'), custom_steps.test_unix_cpp_gpu('cmake_gpu'), - custom_steps.test_unix_python3_integration_gpu('gpu'), custom_steps.test_unix_cpp_package_gpu('gpu_make'), custom_steps.test_unix_scala_gpu('gpu_make'), // TODO(szha): fix and reenable the hanging issue. tracked in #18098 diff --git a/docs/python_docs/python/tutorials/deploy/inference/wine_detector.md b/docs/python_docs/python/tutorials/deploy/inference/wine_detector.md deleted file mode 100644 index b28d709c2f0d..000000000000 --- a/docs/python_docs/python/tutorials/deploy/inference/wine_detector.md +++ /dev/null @@ -1,405 +0,0 @@ - - - - - - - - - - - - - - - - - -# Real-time Object Detection with MXNet On The Raspberry Pi - -This tutorial shows developers who work with the Raspberry Pi or similar embedded ARM-based devices how to compile MXNet for those devices and run a pretrained deep network model. It also shows how to use AWS IoT to manage and monitor MXNet models running on your devices. - -## What's In This Tutorial? - -This tutorial shows how to: - -1. Use MXNet to set up a real-time object classifier on a Raspberry Pi 3 device. - -2. Connect the local Raspberry Pi model to the AWS Cloud with AWS IoT to get real-time updates from the device. - -### Who's This Tutorial For? 
- -It assumes that you are familiar with the Raspbian operating system and the [Raspberry Pi ecosystem](https://www.raspberrypi.org/) and are somewhat familiar with machine learning, MXNet, and [AWS IoT](https://aws.amazon.com/iot/). All code is written in Python 2.7. - -### How to Use This Tutorial - -To follow this tutorial, you must set up your Pi as instructed (preferably from a fresh Raspbian install), and then create the files and run the bash commands described below. All instructions described can be executed on the Raspberry Pi directly or via SSH. - -You will accomplish the following: - -- Build and install MXNet with Python bindings on your Raspbian-based Raspberry Pi -- Fetch and run a pre-trained MXNet model on your Pi -- Create a real-time video analysis application for the Pi -- Connect the application to the AWS IoT service - -## Prerequisites - -To complete this tutorial, you need: - -* Raspbian Wheezy or later, which can be downloaded [here](https://www.raspberrypi.org/downloads/raspbian/), loaded onto an 8GB+ micro SD card (with at least 4GB+ free) -* A [Raspberry Pi 3](https://www.raspberrypi.org/blog/raspberry-pi-3-on-sale/) or equivalent Raspberry Pi with 1GB+ of RAM -* A [Raspberry Pi Camera Module](https://www.raspberrypi.org/products/camera-module-v2/) [activated and running with the corresponding Python module](http://www.pyimagesearch.com/2015/02/23/install-opencv-and-python-on-your-raspberry-pi-2-and-b/) (for the real-time video analysis with the deep network model) -* An AWS account with AWS IoT enabled and the [AWS IoT Python SDK](https://github.com/aws/aws-iot-device-sdk-python) (for remote, real-time managing and monitoring of the model running on the Pi) -* The [cv2 Python library](http://www.pyimagesearch.com/2015/02/23/install-opencv-and-python-on-your-raspberry-pi-2-and-b/) for the Pi - -## Building MXNet for The Pi - -The first step is to get MXNet with the Python bindings running on your Raspberry Pi 3. There is a tutorial for that provided [here](https://mxnet.io/get_started). The linked tutorial walks you through downloading the dependencies, and building the full MXNet library for the Pi with the ARM-specific compile flags. Be sure to build the library with OpenCV, as we will be using a model that requires it to process images. Then you will register the Python bindings to MXNet. After this is done, you should test that your installation works by opening a Python REPL on your Pi and typing the following commands: - - -```bash -python ->>> import mxnet as mx -``` - -*Note: If you are getting memory allocation failed errors at this point (or at any point in this tutorial) it is likely because the full MXNet library takes up a large amount of RAM when loaded. You might want to kill the GUI and other processes that are occupying memory.* - - -## Running A Pre-Trained Inception Model on The Pi - -We are now ready to load a pre-trained model and run inference on the Pi. We will be using a simple object recognition model trained on the ImageNet data set. The model is called batch normalized Inception network (or Inception_BN for short) and it is found in the MXNet model zoo. - -### Getting the Model - -The first step is to download, unzip, and set up the pre-trained deep network model files that we will be using to classify images.
To do this run the following commands in your home directory: - -```bash -curl --header 'Host: data.mxnet.io' --header 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0' --header 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' --header 'Accept-Language: en-US,en;q=0.5' --header 'Referer: http://data.mxnet.io/models/imagenet/' --header 'Connection: keep-alive' 'http://data.mxnet.io/models/imagenet/inception-bn.tar.gz' -o 'inception-bn.tar.gz' -L - -tar -xvzf inception-bn.tar.gz - -mv Inception_BN-0039.params Inception_BN-0000.params -``` - -### Running the Model - -The next step is to create a python script to load the model, and run inference on local image files. To do this create a new file in your home directory called inception_predict.py and add the following code to it: - - -```python -# inception_predict.py - -import mxnet as mx -import numpy as np -import time -import cv2, os, urllib -from collections import namedtuple -Batch = namedtuple('Batch', ['data']) - -# Load the symbols for the networks -with open('synset.txt', 'r') as f: - synsets = [l.rstrip() for l in f] - -# Load the network parameters -sym, arg_params, aux_params = mx.model.load_checkpoint('Inception-BN', 126) - - -# Load the network into an MXNet module and bind the corresponding parameters -mod = mx.mod.Module(symbol=sym, context=mx.cpu()) -mod.bind(for_training=False, data_shapes=[('data', (1,3,224,224))]) -mod.set_params(arg_params, aux_params) - -''' -Function to predict objects by giving the model a pointer to an image file and running a forward pass through the model. - -inputs: -filename = jpeg file of image to classify objects in -mod = the module object representing the loaded model -synsets = the list of symbols representing the model -N = Optional parameter denoting how many predictions to return (default is top 5) - -outputs: -python list of top N predicted objects and corresponding probabilities -''' -def predict(filename, mod, synsets, N=5): - tic = time.time() - img = cv2.cvtColor(cv2.imread(filename), cv2.COLOR_BGR2RGB) - if img is None: - return None - img = cv2.resize(img, (224, 224)) - img = np.swapaxes(img, 0, 2) - img = np.swapaxes(img, 1, 2) - img = img[np.newaxis, :] - print "pre-processed image in "+str(time.time()-tic) - - toc = time.time() - mod.forward(Batch([mx.nd.array(img)])) - prob = mod.get_outputs()[0].asnumpy() - prob = np.squeeze(prob) - print "forward pass in "+str(time.time()-toc) - - - topN = [] - a = np.argsort(prob)[::-1] - for i in a[0:N]: - print('probability=%f, class=%s' %(prob[i], synsets[i])) - topN.append((prob[i], synsets[i])) - return topN - - -# Code to download an image from the internet and run a prediction on it -def predict_from_url(url, N=5): - filename = url.split("/")[-1] - urllib.urlretrieve(url, filename) - img = cv2.imread(filename) - if img is None: - print "Failed to download" - else: - return predict(filename, mod, synsets, N) - -# Code to predict on a local file -def predict_from_local_file(filename, N=5): - return predict(filename, mod, synsets, N) -``` - -Now that we have defined inception_predict.py we can test that the model is running correctly. Open a Python REPL in your home directory and enter the following: - -```bash -python ->>> from inception_predict import * ->>> predict_from_url("https://i.imgur.com/HzafyBA.jpg") -``` - -This should give a reasonable prediction for the fluffy cow in this [image](http://imgur.com/HzafyBA). 
- -``` -pre-processed image in 0.20366191864 -forward pass in 63.2164611816 -probability=0.718524, class=n02403003 ox -probability=0.176381, class=n02389026 sorrel -probability=0.095558, class=n03868242 oxcart -probability=0.002765, class=n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis -probability=0.001262, class=n03935335 piggy bank, penny bank -[(0.71852392, 'n02403003 ox'), (0.17638102, 'n02389026 sorrel'), (0.09555836, 'n03868242 oxcart'), -(0.0027645244, 'n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis'), -(0.0012616422, 'n03935335 piggy bank, penny bank')] -``` - - -## Running an Inception on Real-Time Video From PiCamera - -We can now move on to using this network for object detection in real-time video from the PiCamera. - -Doing this requires sending the images that the camera is capturing to the prediction code that we created in the previous step. To do this, create a new file in your home directory called camera_test.py and add the following code to it: - - -```python -# camera_test.py - -import picamera -import inception_predict - -# Create camera interface -camera = picamera.PiCamera() -while True: - # Take the jpg image from camera - print "Capturing" - filename = '/home/pi/cap.jpg' - # Show quick preview of what's being captured - camera.start_preview() - camera.capture(filename) - camera.stop_preview() - - # Run inception prediction on image - print "Predicting" - topn = inception_predict.predict_from_local_file(filename, N=5) - - # Print the top N most likely objects in image (default set to 5, change this in the function call above) - print topn -``` - -You can then run this file by entering the following command: - -```bash -python camera_test.py -``` - -If camera_test.py is working you should see a preview every few seconds of the image that is being captured and fed to the model, as well as predicted classes for objects in the image being written to the terminal. - -Try pointing the PiCamera at a few different objects and see what predictions the network comes out with. - -## Connecting Our Model To The AWS Cloud - -We can now move on to adding the code to send the predictions that this real-time model is making locally to the AWS cloud if certain conditions are met. - -The first step is to set up an AWS account if you don't have one yet. Then go to the [AWS IoT dashboard](https://us-west-2.console.aws.amazon.com/iotv2/home?region=us-west-2#/thinghub) and register a new device. - -After the device is registered, download and copy the corresponding rootCA, Certificate, and Private key to your home directory. Note the unique endpoint of your device shadow on the AWS IoT Dashboard. - -We will now build an application, based off the code in camera_test.py, which will send a message to the cloud whenever a wine bottle is detected in a frame by the PiCamera. 
- -To do this create a new file in your home directory called wine_alerter.py and add the following code to it: - - -```python -# wine_alerter.py - -import AWSIoTPythonSDK -from AWSIoTPythonSDK.MQTTLib import AWSIoTMQTTClient -import sys -import logging -import time -import getopt -import picamera -import inception_predict - -# Custom MQTT message callback -def customCallback(client, userdata, message): - print("Received a new message: ") - print(message.payload) - print("from topic: ") - print(message.topic) - print("--------------\n\n") - -# Usage -usageInfo = """Usage: - -Use certificate based mutual authentication: -python wine_alerter.py -e -r -c -k - -Use MQTT over WebSocket: -python wine_alerter.py -e -r -w - -Type "python wine_alerter.py -h" for available options. -""" - -# Help info -helpInfo = """-e, --endpoint - Your AWS IoT custom endpoint --r, --rootCA - Root CA file path --c, --cert - Certificate file path --k, --key - Private key file path --w, --websocket - Use MQTT over WebSocket --h, --help - Help information -""" - -# Read in command-line parameters -useWebsocket = False -host = "" -rootCAPath = "" -certificatePath = "" -privateKeyPath = "" -try: - opts, args = getopt.getopt(sys.argv[1:], "hwe:k:c:r:", ["help", "endpoint=", "key=","cert=","rootCA=", "websocket"]) - if len(opts) == 0: - raise getopt.GetoptError("No input parameters!") - for opt, arg in opts: - if opt in ("-h", "--help"): - print(helpInfo) - exit(0) - if opt in ("-e", "--endpoint"): - host = arg - if opt in ("-r", "--rootCA"): - rootCAPath = arg - if opt in ("-c", "--cert"): - certificatePath = arg - if opt in ("-k", "--key"): - privateKeyPath = arg - if opt in ("-w", "--websocket"): - useWebsocket = True -except getopt.GetoptError: - print(usageInfo) - exit(1) - -# Missing configuration notification -missingConfiguration = False -if not host: - print("Missing '-e' or '--endpoint'") - missingConfiguration = True -if not rootCAPath: - print("Missing '-r' or '--rootCA'") - missingConfiguration = True -if not useWebsocket: - if not certificatePath: - print("Missing '-c' or '--cert'") - missingConfiguration = True - if not privateKeyPath: - print("Missing '-k' or '--key'") - missingConfiguration = True -if missingConfiguration: - exit(2) - - -# Configure logging -logger = logging.getLogger("AWSIoTPythonSDK.core") -logger.setLevel(logging.DEBUG) -streamHandler = logging.StreamHandler() -formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') -streamHandler.setFormatter(formatter) -logger.addHandler(streamHandler) - - -# Init AWSIoTMQTTClient For Publish/Subscribe Communication With Server -myAWSIoTMQTTClient = None -if useWebsocket: - myAWSIoTMQTTClient = AWSIoTMQTTClient("basicPubSub", useWebsocket=True) - myAWSIoTMQTTClient.configureEndpoint(host, 443) - myAWSIoTMQTTClient.configureCredentials(rootCAPath) -else: - myAWSIoTMQTTClient = AWSIoTMQTTClient("basicPubSub") - myAWSIoTMQTTClient.configureEndpoint(host, 8883) - myAWSIoTMQTTClient.configureCredentials(rootCAPath, privateKeyPath, certificatePath) - - -# AWSIoTMQTTClient connection configuration -myAWSIoTMQTTClient.configureAutoReconnectBackoffTime(1, 32, 20) -myAWSIoTMQTTClient.configureOfflinePublishQueueing(-1) # Infinite offline Publish queueing -myAWSIoTMQTTClient.configureDrainingFrequency(2) # Draining: 2 Hz -myAWSIoTMQTTClient.configureConnectDisconnectTimeout(10) # 10 sec -myAWSIoTMQTTClient.configureMQTTOperationTimeout(5) # 5 sec - - -# Connect and subscribe to AWS IoT -myAWSIoTMQTTClient.connect() 
-myAWSIoTMQTTClient.subscribe("sdk/test/Python", 1, customCallback) -time.sleep(2) - - -# Start the Camera and tell the Server we are alive -print "Running camera" -myAWSIoTMQTTClient.publish("sdk/test/Python", "New Message: Starting Camera", 0) -camera = picamera.PiCamera() - -# Capture forever (this is a modified version of camera_test.py) -while True: - filename = '/home/pi/cap.jpg' - camera.start_preview() - camera.capture(filename) - camera.stop_preview() - topn = inception_predict.predict_from_local_file(filename, N=5) - - # Check if either of the top two predictions are wine related and publish a message if it is - # you can change 'wine' here to anything you want to alert the server about detecting - if 'wine' in topn[0][1] or 'wine' in topn[1][1]: - myAWSIoTMQTTClient.publish("sdk/test/Python", "New Message: WINE DETECTED!", 0) -``` - -You can then run this file by entering the following command - -```bash -python wine_alerter.py -e -r -c -k -``` - -If this is working you should see the same kind of image preview you did with camera_test.py every few seconds, however the console will only print a message now when a wine bottle is detected in the shot (you can edit the bottom lines in the wine_alerter.py code to make this alert for any object label from the [ImageNet-11k dataset](http://image-net.org/index) that you specify). - -You can open up the activity tab for the thing that you registered on the AWS IoT Dashboard and see the corresponding messages pushed to the server whenever a wine bottle is detected in a camera shot. Even if network connectivity periodically fails, AWS IoT will push updates out to the server when possible, allowing this system to reliably let you know when there is wine around. - -## Summary -You now have a Raspberry Pi running a pre-trained MXNet model fully locally. This model is also linked to the cloud via AWS IoT and can reliably alert AWS whenever it sees a wine bottle. - -You can now extend this demo to create more interesting applications, such as using AWS IoT to push model updates to your Pi, loading different pre-trained models from the MXNet Model Zoo onto the Pi, or even training full end-to-end models on the Pi. diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md index 20e9cabcdaf8..7fb4d48157f6 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md @@ -260,53 +260,7 @@ finetune_net.export("flower-recognition", epoch=epochs) ``` -`export` creates `flower-recognition-symbol.json` and `flower-recognition-0040.params` (`0040` is for 40 epochs we ran) in the current directory. These files can be used for model deployment in the next section. - -## Load the model and run inference using the MXNet Module API - -MXNet provides various useful tools and interfaces for deploying your model for inference. For example, you can use [MXNet Model Server](https://github.com/awslabs/mxnet-model-server) to start a service and host your trained model easily. -Besides that, you can also use MXNet's different language APIs to integrate your model with your existing service. We provide [Python](/api/python.html), [Java](/api/java.html), [Scala](/api/scala.html), and [C++](/api/cpp) APIs. - -Here we will briefly introduce how to run inference using Module API in Python. 
In general, prediction consists of the following steps: -1. Load the model architecture (symbol file) and trained parameter values (params file) -2. Load the synset file for label names -3. Load the image and apply the same transformation we did on validation dataset during training -4. Run a forward pass on the image data -5. Convert output probabilities to predicted label name - -```python -import numpy as np -from collections import namedtuple - -ctx = mx.cpu() -# load model symbol and params -sym, arg_params, aux_params = mx.model.load_checkpoint('flower-recognition', epochs) -mod = mx.mod.Module(symbol=sym, context=ctx, label_names=None) -mod.bind(for_training=False, data_shapes=[('data', (1, 3, 224, 224))], label_shapes=mod._label_shapes) -mod.set_params(arg_params, aux_params, allow_missing=True) - -# load synset for label names -with open('synset.txt', 'r') as f: - labels = [l.rstrip() for l in f] - -# load an image for prediction -img = mx.image.imread('./data/test/lotus/image_01832.jpg') -# apply transform we did during training -img = validation_transformer(img) -# batchify -img = img.expand_dims(axis=0) -Batch = namedtuple('Batch', ['data']) -mod.forward(Batch([img])) -prob = mod.get_outputs()[0].asnumpy() -prob = np.squeeze(prob) -idx = np.argmax(prob) -print('probability=%f, class=%s' % (prob[idx], labels[idx])) -``` - -Following is the output, you can see the image has been classified as lotus correctly. -```text -probability=9.798435, class=lotus -``` +`export` creates `flower-recognition-symbol.json` and `flower-recognition-0040.params` (`0040` is for 40 epochs we ran) in the current directory. These files can be used for model deployment using the `HybridBlock.import` API. ## What's next diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md deleted file mode 100644 index 23654fc6a33a..000000000000 --- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train.md +++ /dev/null @@ -1,339 +0,0 @@ - - - - - - - - - - - - - - - - - - -# Train a Linear Regression Model with Sparse Symbols -In previous tutorials, we introduced `CSRNDArray` and `RowSparseNDArray`, -the basic data structures for manipulating sparse data. -MXNet also provides `Sparse Symbol` API, which enables symbolic expressions that handle sparse arrays. -In this tutorial, we first focus on how to compose a symbolic graph with sparse operators, -then train a linear regression model using sparse symbols with the Module API. - -## Prerequisites - -To complete this tutorial, we need: - -- MXNet. See the instructions for your operating system in [Setup and Installation](/get_started). - -- [Jupyter Notebook](https://jupyter.org/index.html) and [Python Requests](https://3.python-requests.org/) packages. -``` -pip install jupyter requests -``` - -- Basic knowledge of Symbol in MXNet. See the detailed tutorial for Symbol in [Symbol - Neural Network Graphs and Auto-differentiation](https://mxnet.apache.org/tutorials/basic/symbol.html). - -- Basic knowledge of CSRNDArray in MXNet. See the detailed tutorial for CSRNDArray in [CSRNDArray - NDArray in Compressed Sparse Row Storage Format](/api/python/docs/tutorials/packages/ndarray/sparse/csr.html). - -- Basic knowledge of RowSparseNDArray in MXNet. See the detailed tutorial for RowSparseNDArray in [RowSparseNDArray - NDArray for Sparse Gradient Updates](/api/python/docs/tutorials/packages/ndarray/sparse/row_sparse.html). 
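Returning to the `gluon_from_experiment_to_deployment.md` hunk above: the patch replaces the removed Module-based inference walkthrough with a one-line pointer to loading the exported files for deployment. As a rough sketch only (assuming the `flower-recognition` prefix exported with `epochs=40` in that tutorial, and using `mx.gluon.nn.SymbolBlock.imports`, Gluon's API for loading exported symbol/params files), inference without the Module API could look like this:

```python
import mxnet as mx
import numpy as np

ctx = mx.cpu()
# Load the exported symbol/params pair as a Gluon SymbolBlock
# (file names assume finetune_net.export("flower-recognition", epoch=40))
net = mx.gluon.nn.SymbolBlock.imports('flower-recognition-symbol.json', ['data'],
                                      'flower-recognition-0040.params', ctx=ctx)

# Forward pass on a dummy 1x3x224x224 batch; a real deployment would apply
# the same validation transform used during training to a real image instead
img = mx.nd.random.uniform(shape=(1, 3, 224, 224), ctx=ctx)
prob = net(img).softmax().asnumpy().squeeze()
print('predicted class index: %d, probability=%f' % (int(np.argmax(prob)), float(prob.max())))
```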
- -## Variables - -Variables are placeholder for arrays. We can use them to hold sparse arrays too. - -### Variable Storage Types - -The `stype` attribute of a variable is used to indicate the storage type of the array. -By default, the `stype` of a variable is "default" which indicates the default dense storage format. -We can specify the `stype` of a variable as "csr" or "row_sparse" to hold sparse arrays. - - -```python -import mxnet as mx -import numpy as np -import random - -# set the seeds for repeatability -random.seed(42) -np.random.seed(42) -mx.random.seed(42) - -# Create a variable to hold an NDArray -a = mx.sym.Variable('a') -# Create a variable to hold a CSRNDArray -b = mx.sym.Variable('b', stype='csr') -# Create a variable to hold a RowSparseNDArray -c = mx.sym.Variable('c', stype='row_sparse') -(a, b, c) -``` - - - - -`(, , )` - - - -### Bind with Sparse Arrays - -The sparse symbols constructed above declare storage types of the arrays to hold. -To evaluate them, we need to feed the free variables with sparse data. - -You can instantiate an executor from a sparse symbol by using the `simple_bind` method, -which allocate zeros to all free variables according to their storage types. -The executor provides `forward` method for evaluation and an attribute -`outputs` to get all the results. Later, we will show the use of the `backward` method and other methods computing the gradients and updating parameters. A simple example first: - - -```python -shape = (2,2) -# Instantiate an executor from sparse symbols -b_exec = b.simple_bind(ctx=mx.cpu(), b=shape) -c_exec = c.simple_bind(ctx=mx.cpu(), c=shape) -b_exec.forward() -c_exec.forward() -# Sparse arrays of zeros are bound to b and c -print(b_exec.outputs, c_exec.outputs) -``` - -``` -([ -], [ -]) -``` - -You can update the array held by the variable by accessing executor's `arg_dict` and assigning new values. - - -```python -b_exec.arg_dict['b'][:] = mx.nd.ones(shape).tostype('csr') -b_exec.forward() -# The array `b` holds are updated to be ones -eval_b = b_exec.outputs[0] -{'eval_b': eval_b, 'eval_b.asnumpy()': eval_b.asnumpy()} -``` - - - -``` -{'eval_b': - , 'eval_b.asnumpy()': array([[ 1., 1.], - [ 1., 1.]], dtype=float32)} -``` - - -## Symbol Composition and Storage Type Inference - -### Basic Symbol Composition - -The following example builds a simple element-wise addition expression with different storage types. -The sparse symbols are available in the `mx.sym.sparse` package. - - -```python -# Element-wise addition of variables with "default" stype -d = mx.sym.elemwise_add(a, a) -# Element-wise addition of variables with "csr" stype -e = mx.sym.sparse.negative(b) -# Element-wise addition of variables with "row_sparse" stype -f = mx.sym.sparse.elemwise_add(c, c) -{'d':d, 'e':e, 'f':f} -``` - - - -``` -{'d': , - 'e': , - 'f': } -``` - - -### Storage Type Inference - -What will be the output storage types of sparse symbols? In MXNet, for any sparse symbol, the result storage types are inferred based on storage types of inputs. -You can read the [Sparse Symbol API](/api/python/docs/api/symbol/sparse/index.html) documentation to find what output storage types are. In the example below we will try out the storage types introduced in the Row Sparse and Compressed Sparse Row tutorials: `default` (dense), `csr`, and `row_sparse`. 
- - -```python -add_exec = mx.sym.Group([d, e, f]).simple_bind(ctx=mx.cpu(), a=shape, b=shape, c=shape) -add_exec.forward() -dense_add = add_exec.outputs[0] -# The output storage type of elemwise_add(csr, csr) will be inferred as "csr" -csr_add = add_exec.outputs[1] -# The output storage type of elemwise_add(row_sparse, row_sparse) will be inferred as "row_sparse" -rsp_add = add_exec.outputs[2] -{'dense_add.stype': dense_add.stype, 'csr_add.stype':csr_add.stype, 'rsp_add.stype': rsp_add.stype} -``` - - - -``` -{'csr_add.stype': 'csr', - 'dense_add.stype': 'default', - 'rsp_add.stype': 'row_sparse'} -``` - - -### Storage Type Fallback - -For operators that don't specialize in certain sparse arrays, you can still use them with sparse inputs with some performance penalty. In MXNet, dense operators require all inputs and outputs to be in the dense format. If sparse inputs are provided, MXNet will convert sparse inputs into dense ones temporarily so that the dense operator can be used. If sparse outputs are provided, MXNet will convert the dense outputs generated by the dense operator into the provided sparse format. Warning messages will be printed when such a storage fallback event happens. - - -```python -# `log` operator doesn't support sparse inputs at all, but we can fallback on the dense implementation -csr_log = mx.sym.log(a) -# `elemwise_add` operator doesn't support adding csr with row_sparse, but we can fallback on the dense implementation -csr_rsp_add = mx.sym.elemwise_add(b, c) -fallback_exec = mx.sym.Group([csr_rsp_add, csr_log]).simple_bind(ctx=mx.cpu(), a=shape, b=shape, c=shape) -fallback_exec.forward() -fallback_add = fallback_exec.outputs[0] -fallback_log = fallback_exec.outputs[1] -{'fallback_add': fallback_add, 'fallback_log': fallback_log} -``` - - - -``` -{'fallback_add': - [[ 0. 0.] - [ 0. 0.]] - , 'fallback_log': - [[-inf -inf] - [-inf -inf]] - } -``` - - -### Inspecting Storage Types of the Symbol Graph - -When the environment variable `MXNET_INFER_STORAGE_TYPE_VERBOSE_LOGGING` is set to `1`, MXNet will log the storage type information of -operators' inputs and outputs in the computation graph. For example, we can inspect the storage types of -a linear classification network with sparse operators. Uncomment the line below and inspect your console.: - - -```python -# Set logging level for executor -import mxnet as mx -import os -#os.environ['MXNET_INFER_STORAGE_TYPE_VERBOSE_LOGGING'] = "1" -# Data in csr format -data = mx.sym.var('data', stype='csr', shape=(32, 10000)) -# Weight in row_sparse format -weight = mx.sym.var('weight', stype='row_sparse', shape=(10000, 2)) -bias = mx.symbol.Variable("bias", shape=(2,)) -dot = mx.symbol.sparse.dot(data, weight) -pred = mx.symbol.broadcast_add(dot, bias) -y = mx.symbol.Variable("label") -output = mx.symbol.SoftmaxOutput(data=pred, label=y, name="output") -executor = output.simple_bind(ctx=mx.cpu()) -``` - -## Training with Module APIs - -In the following section we'll walk through how one can implement **linear regression** using sparse symbols and sparse optimizers. - -The function you will explore is: *y = x1 + 2x2 + ... 100x100*, where *(x1,x2, ..., x100)* are input features and *y* is the corresponding label. - -### Preparing the Data - -In MXNet, both [mx.io.LibSVMIter](/api/python/docs/api/mxnet/io/index.html#mxnet.io.LibSVMIter) -and [mx.io.NDArrayIter](/api/python/docs/api/mxnet/io/index.html#mxnet.io.NDArrayIter) -support loading sparse data in CSR format. In this example, we'll use the `NDArrayIter`. 
- -You may see some warnings from SciPy. You don't need to worry about those for this example. - - -```python -# Random training data -feature_dimension = 100 -train_data = mx.test_utils.rand_ndarray((1000, feature_dimension), 'csr', 0.01) -target_weight = mx.nd.arange(1, feature_dimension + 1).reshape((feature_dimension, 1)) -train_label = mx.nd.dot(train_data, target_weight) -batch_size = 1 -train_iter = mx.io.NDArrayIter(train_data, train_label, batch_size, last_batch_handle='discard', label_name='label') -``` - -### Defining the Model - -Below is an example of a linear regression model specifying the storage type of the variables. - - -```python -initializer = mx.initializer.Normal(sigma=0.01) -X = mx.sym.Variable('data', stype='csr') -Y = mx.symbol.Variable('label') -weight = mx.symbol.Variable('weight', stype='row_sparse', shape=(feature_dimension, 1), init=initializer) -bias = mx.symbol.Variable('bias', shape=(1, )) -pred = mx.sym.broadcast_add(mx.sym.sparse.dot(X, weight), bias) -lro = mx.sym.LinearRegressionOutput(data=pred, label=Y, name="lro") -``` - -The above network uses the following symbols: - -1. `Variable X`: The placeholder for sparse data inputs. The `csr` stype indicates that the array to hold is in CSR format. - -2. `Variable Y`: The placeholder for dense labels. - -3. `Variable weight`: The placeholder for the weight to learn. The `stype` of weight is specified as `row_sparse` so that it is initialized as RowSparseNDArray, - and the optimizer will perform sparse update rules on it. The `init` attribute specifies what initializer to use for this variable. - -4. `Variable bias`: The placeholder for the bias to learn. - -5. `sparse.dot`: The dot product operation of `X` and `weight`. The sparse implementation will be invoked to handle `csr` and `row_sparse` inputs. - -6. `broadcast_add`: The broadcasting add operation to apply `bias`. - -7. `LinearRegressionOutput`: The output layer which computes *l2* loss against its input and the labels provided to it. - -### Training the model - -Once we have defined the model structure, the next step is to create a module and initialize the parameters and optimizer. - - -```python -# Create module -mod = mx.mod.Module(symbol=lro, data_names=['data'], label_names=['label']) -# Allocate memory by giving the input data and label shapes -mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) -# Initialize parameters by random numbers -mod.init_params(initializer=initializer) -# Use SGD as the optimizer, which performs sparse update on "row_sparse" weight -sgd = mx.optimizer.SGD(learning_rate=0.05, rescale_grad=1.0/batch_size, momentum=0.9) -mod.init_optimizer(optimizer=sgd) -``` - -Finally, we train the parameters of the model to fit the training data by using the `forward`, `backward`, and `update` methods in Module. 
- - -```python -# Use mean square error as the metric -metric = mx.metric.create('MSE') -# Train 10 epochs -for epoch in range(10): - train_iter.reset() - metric.reset() - for batch in train_iter: - mod.forward(batch, is_train=True) # compute predictions - mod.update_metric(metric, batch.label) # accumulate prediction accuracy - mod.backward() # compute gradients - mod.update() # update parameters - print('Epoch %d, Metric = %s' % (epoch, metric.get())) -assert metric.get()[1] < 1, "Achieved MSE (%f) is larger than expected (1.0)" % metric.get()[1] -``` - -`Epoch 9, Metric = ('mse', 0.35979430613957991)` - - - - -### Training the model with multiple machines or multiple devices - -Distributed training with `row_sparse` weights and gradients are supported in MXNet, which significantly reduces communication cost for large models. To train a sparse model with multiple machines, you need to call `prepare` before `forward`, or `save_checkpoint`. -Please refer to the example in [mxnet/example/sparse/linear_classification](https://github.com/apache/incubator-mxnet/tree/master/example/sparse/linear_classification) -for more details. - - diff --git a/docs/python_docs/python/tutorials/packages/onnx/super_resolution.md b/docs/python_docs/python/tutorials/packages/onnx/super_resolution.md deleted file mode 100644 index eec904d80a64..000000000000 --- a/docs/python_docs/python/tutorials/packages/onnx/super_resolution.md +++ /dev/null @@ -1,140 +0,0 @@ - - - - - - - - - - - - - - - - - -# Importing an ONNX model into MXNet - -In this tutorial we will: - -- learn how to load a pre-trained ONNX model file into MXNet. -- run inference in MXNet. - -## Prerequisites -This example assumes that the following python packages are installed: -- [mxnet](/get_started) -- [onnx](https://github.com/onnx/onnx) (follow the install guide) -- Pillow - A Python Image Processing package and is required for input pre-processing. It can be installed with ```pip install Pillow```. -- matplotlib - - -```python -from PIL import Image -import numpy as np -import mxnet as mx -import mxnet.contrib.onnx as onnx_mxnet -from mxnet.test_utils import download -from matplotlib.pyplot import imshow -``` - -### Fetching the required files - - -```python -img_url = 'https://s3.amazonaws.com/onnx-mxnet/examples/super_res_input.jpg' -download(img_url, 'super_res_input.jpg') -model_url = 'https://s3.amazonaws.com/onnx-mxnet/examples/super_resolution.onnx' -onnx_model_file = download(model_url, 'super_resolution.onnx') -``` - -## Loading the model into MXNet - -To completely describe a pre-trained model in MXNet, we need two elements: a symbolic graph, containing the model's network definition, and a binary file containing the model weights. You can import the ONNX model and get the symbol and parameters objects using ``import_model`` API. The paameter object is split into argument parameters and auxilliary parameters. - - -```python -sym, arg, aux = onnx_mxnet.import_model(onnx_model_file) -``` - -We can now visualize the imported model (graphviz needs to be installed) - - -```python -mx.viz.plot_network(sym, node_attrs={"shape":"oval","fixedsize":"false"}) -``` - - - - -![svg](https://s3.amazonaws.com/onnx-mxnet/examples/super_res_mxnet_model.png) - - - -## Input Pre-processing - -We will transform the previously downloaded input image into an input tensor. 
- - -```python -img = Image.open('super_res_input.jpg').resize((224, 224)) -img_ycbcr = img.convert("YCbCr") -img_y, img_cb, img_cr = img_ycbcr.split() -test_image = np.array(img_y)[np.newaxis, np.newaxis, :, :] -``` - -## Run Inference using MXNet's Module API - -We will use MXNet's Module API to run the inference. For this we will need to create the module, bind it to the input data and assign the loaded weights from the two parameter objects - argument parameters and auxilliary parameters. - -To obtain the input data names we run the following line, which picks all the inputs of the symbol graph excluding the argument and auxiliary parameters: - -```python -data_names = [graph_input for graph_input in sym.list_inputs() - if graph_input not in arg and graph_input not in aux] -print(data_names) -``` - -```['1']``` - -```python -mod = mx.mod.Module(symbol=sym, data_names=data_names, context=mx.cpu(), label_names=None) -mod.bind(for_training=False, data_shapes=[(data_names[0],test_image.shape)], label_shapes=None) -mod.set_params(arg_params=arg, aux_params=aux, allow_missing=True, allow_extra=True) -``` - -Module API's forward method requires batch of data as input. We will prepare the data in that format and feed it to the forward method. - - -```python -from collections import namedtuple -Batch = namedtuple('Batch', ['data']) - -# forward on the provided data batch -mod.forward(Batch([mx.nd.array(test_image)])) -``` - -To get the output of previous forward computation, you use ``module.get_outputs()`` method. -It returns an ``ndarray`` that we convert to a ``numpy`` array and then to Pillow's image format - - -```python -output = mod.get_outputs()[0][0][0] -img_out_y = Image.fromarray(np.uint8((output.asnumpy().clip(0, 255)), mode='L')) -result_img = Image.merge( -"YCbCr", [ - img_out_y, - img_cb.resize(img_out_y.size, Image.BICUBIC), - img_cr.resize(img_out_y.size, Image.BICUBIC) -]).convert("RGB") -result_img.save("super_res_output.jpg") -``` - -You can now compare the input image and the resulting output image. As you will notice, the model was able to increase the spatial resolution from ``256x256`` to ``672x672``. - -| Input Image | Output Image | -| ----------- | ------------ | -| ![input](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/onnx/images/super_res_input.jpg?raw=true) | ![output](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/doc/tutorials/onnx/images/super_res_output.jpg?raw=true) | - - diff --git a/docs/python_docs/python/tutorials/performance/backend/amp.md b/docs/python_docs/python/tutorials/performance/backend/amp.md index c862b51131f4..c18ff29ef4c1 100644 --- a/docs/python_docs/python/tutorials/performance/backend/amp.md +++ b/docs/python_docs/python/tutorials/performance/backend/amp.md @@ -262,11 +262,10 @@ We got 60% speed increase from 3 additional lines of code! ## Inference with AMP -To do inference with mixed precision for a trained model in FP32, you can use the conversion APIs: `amp.convert_model` for symbolic model and `amp.convert_hybrid_block` for gluon models. The conversion APIs will take the FP32 model as input and will return a mixed precision model, which can be used to run inference. -Below, we demonstrate for a gluon model and a symbolic model: +To do inference with mixed precision for a trained model in FP32, you can use the conversion API `amp.convert_hybrid_block` for gluon models. 
The conversion APIs will take the FP32 model as input and will return a mixed precision model, which can be used to run inference. +Below, we demonstrate for a gluon model: - Conversion from FP32 model to mixed precision model. - Run inference on the mixed precision model. -- For AMP conversion of bucketing module please refer to [example/rnn/bucketing/README.md](https://github.com/apache/incubator-mxnet/blob/master/example/rnn/bucketing/README.md). ```python with mx.Context(mx.gpu(0)): @@ -283,69 +282,13 @@ with mx.Context(mx.gpu(0)): result = converted_model.forward(mx.nd.random.uniform(shape=(1, 3, 224, 224), dtype=np.float32)) - # Below is an example of converting a symbolic model to a mixed precision model - model_path = "model" - if not os.path.isdir(model_path): - os.mkdir(model_path) - prefix, epoch = mx.test_utils.download_model("imagenet1k-resnet-18", dst_dir=model_path) - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, - arg_params, - aux_params) - - # Run dummy inference with the converted symbolic model - mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.current_context()) - mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]]) - mod.set_params(result_arg_params, result_aux_params) - mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], - label=[mx.nd.ones((1,))])) - mod.get_outputs()[0].wait_to_read() print("Conversion and Inference completed successfully") ``` You can also customize the operators to run in FP16 versus the operator to run in FP32 or to conditionally run in FP32. -Also, you can force cast the params wherever possible to FP16. Below is an example which demonstrates both these use cases -for symbolic model. You can do the same for gluon hybrid block with `amp.convert_hybrid_block` API, `cast_optional_params` flag. - -```python -with mx.Context(mx.gpu(0)): - # Below is an example of converting a symbolic model to a mixed precision model - # with only Convolution op being force casted to FP16. 
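The symbolic customization example being removed here leaves only the note that operators and parameter casting can still be customized. A hedged Gluon sketch of the same idea, assuming `amp.convert_hybrid_block` keeps the `target_dtype_ops`, `fp32_ops`, and `cast_optional_params` arguments mentioned in the removed text, and using an illustrative `resnet18_v1` network, could look like this:

```python
import mxnet as mx
import numpy as np
from mxnet.contrib import amp
from mxnet.gluon.model_zoo import vision

with mx.Context(mx.gpu(0)):
    net = vision.resnet18_v1()
    net.initialize(ctx=mx.current_context())
    net.hybridize()
    net(mx.nd.zeros((1, 3, 224, 224)))  # run once so the cached graph exists

    # Convolution runs in FP16, FullyConnected stays in FP32,
    # and parameters are force cast to FP16 wherever possible
    converted = amp.convert_hybrid_block(net,
                                         target_dtype_ops=["Convolution"],
                                         fp32_ops=["FullyConnected"],
                                         cast_optional_params=True)
    out = converted(mx.nd.random.uniform(shape=(1, 3, 224, 224), dtype=np.float32))
    out.wait_to_read()
```

As with the removed symbolic flow, inspecting the dtype of the converted parameters is a quick way to confirm that `cast_optional_params` took effect.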
- model_path = "model" - if not os.path.isdir(model_path): - os.mkdir(model_path) - prefix, epoch = mx.test_utils.download_model("imagenet1k-resnet-18", dst_dir=model_path) - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - - # All Convolution ops should run in FP16, SoftmaxOutput and FullyConnected should run in FP32 - # cast_optional_params=True: Force cast params to FP16 wherever possible - result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, - arg_params, - aux_params, - target_dtype_ops=["Convolution"], - fp32_ops=["SoftmaxOutput", "FullyConnected"], - cast_optional_params=True) - - # Run dummy inference with the converted symbolic model - mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.current_context()) - mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]]) - mod.set_params(result_arg_params, result_aux_params) - mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], - label=[mx.nd.ones((1,))])) - mod.get_outputs()[0].wait_to_read() - - # Assert that the params for conv are in FP16, this is because cast_optional_params is set to True - assert mod._arg_params["conv0_weight"].dtype == np.float16 - # FullyConnected params stay in FP32 - assert mod._arg_params["fc1_bias"].dtype == np.float32 - - print("Conversion and Inference completed successfully") - - # Serialize AMP model and save to disk - mod.save_checkpoint("amp_tutorial_model", 0, remove_amp_cast=False) -``` +Also, you can force cast the params wherever possible to FP16. ## Current limitations of AMP - AMP's dynamic loss scaling currently supports only Gluon trainer with `update_on_kvstore=False` option set -- Using `SoftmaxOutput`, `LinearRegressionOutput`, `LogisticRegressionOutput`, `MAERegressionOutput` with dynamic loss scaling does not work when training networks with multiple Gluon trainers and so multiple loss scales \ No newline at end of file +- Using `SoftmaxOutput`, `LinearRegressionOutput`, `LogisticRegressionOutput`, `MAERegressionOutput` with dynamic loss scaling does not work when training networks with multiple Gluon trainers and so multiple loss scales diff --git a/docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_quantization.md b/docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_quantization.md deleted file mode 100644 index 8c15af267cd4..000000000000 --- a/docs/python_docs/python/tutorials/performance/backend/mkldnn/mkldnn_quantization.md +++ /dev/null @@ -1,258 +0,0 @@ - - - - - - - - - - - - - - - - - -# Quantize with MKL-DNN backend - -This document is to introduce how to quantize the customer models from FP32 to INT8 with Apache/MXNet toolkit and APIs under Intel CPU. - -If you are not familiar with Apache/MXNet quantization flow, please reference [quantization blog](https://medium.com/apache-mxnet/model-quantization-for-production-level-neural-network-inference-f54462ebba05) first, and the performance data is shown in [Apache/MXNet C++ interface](https://github.com/apache/incubator-mxnet/tree/master/cpp-package/example/inference) and [GluonCV](https://gluon-cv.mxnet.io/build/examples_deployment/int8_inference.html). - -## Installation and Prerequisites - -Installing MXNet with MKLDNN backend is an easy and essential process. You can follow [How to build and install MXNet with MKL-DNN backend](/api/python/docs/tutorials/performance/backend/mkldnn/mkldnn_readme.html) to build and install MXNet from source. 
Also, you can install the release or nightly version via PyPi and pip directly by running: - -``` -# release version -pip install mxnet-mkl -# nightly version -pip install mxnet-mkl --pre -``` - -## Image Classification Demo - -A quantization script [imagenet_gen_qsym_mkldnn.py](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/imagenet_gen_qsym_mkldnn.py) has been designed to launch quantization for image-classification models. This script is integrated with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that all pre-trained models can be downloaded from Gluon-CV and then converted for quantization. For details, you can refer [Model Quantization with Calibration Examples](https://github.com/apache/incubator-mxnet/blob/master/example/quantization/README.md). - -## Integrate Quantization Flow to Your Project - -Quantization flow works for both symbolic and Gluon models. If you're using Gluon, you can first refer [Saving and Loading Gluon Models](/api/python/docs/tutorials/packages/gluon/blocks/save_load_params.html) to hybridize your computation graph and export it as a symbol before running quantization. - -In general, the quantization flow includes 4 steps. The user can get the acceptable accuracy from step 1 to 3 with minimum effort. Most of thing in this stage is out-of-box and the data scientists and researchers only need to focus on how to represent data and layers in their model. After a quantized model is generated, you may want to deploy it online and the performance will be the next key point. Thus, step 4, calibration, can improve the performance a lot by reducing lots of runtime calculation. - -![quantization flow](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/quantization.png) - -Now, we are going to take Gluon ResNet18 as an example to show how each step work. - -### Initialize Model - -```python -import logging -import mxnet as mx -from mxnet.gluon.model_zoo import vision -from mxnet.contrib.quantization import * - -logging.basicConfig() -logger = logging.getLogger('logger') -logger.setLevel(logging.INFO) - -batch_shape = (1, 3, 224, 224) -resnet18 = vision.resnet18_v1(pretrained=True) -resnet18.hybridize() -resnet18.forward(mx.nd.zeros(batch_shape)) -resnet18.export('resnet18_v1') -sym, arg_params, aux_params = mx.model.load_checkpoint('resnet18_v1', 0) -# (optional) visualize float32 model -mx.viz.plot_network(sym) -``` -First, we download resnet18-v1 model from gluon modelzoo and export it as a symbol. You can visualize float32 model. Below is a raw residual block. - -![float32 model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/fp32_raw.png) - -#### Model Fusion - -```python -sym = sym.get_backend_symbol('MKLDNN_QUANTIZE') -# (optional) visualize fused float32 model -mx.viz.plot_network(sym) -``` -It's important to add this line to enable graph fusion before quantization to get better performance. Below is a fused residual block. Batchnorm, Activation and elemwise_add are fused into Convolution. - -![float32 fused model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/fp32_fusion.png) - -### Quantize Model - -A python interface `quantize_graph` is provided for the user. Thus, it is very flexible for the data scientist to construct the expected models based on different requirements in a real deployment. - -```python -# quantize configs -# set exclude layers -excluded_names = [] -# set calib mode. 
-calib_mode = 'none' -# set calib_layer -calib_layer = None -# set quantized_dtype -quantized_dtype = 'auto' -logger.info('Quantizing FP32 model Resnet18-V1') -qsym, qarg_params, aux_params, collector = quantize_graph(sym=sym, arg_params=arg_params, aux_params=aux_params, - excluded_sym_names=excluded_names, - calib_mode=calib_mode, calib_layer=calib_layer, - quantized_dtype=quantized_dtype, logger=logger) -# (optional) visualize quantized model -mx.viz.plot_network(qsym) -# save quantized model -mx.model.save_checkpoint('quantized-resnet18_v1', 0, qsym, qarg_params, aux_params) -``` - -By applying `quantize_graph` to the symbolic model, a new quantized model can be generated, named `qsym` along with its parameters. We can see `_contrib_requantize` operators are inserted after `Convolution` to convert the INT32 output to FP32. - -![none calibrated model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/none_calib.png) - -Below table gives some descriptions. - -| param | type | description| -|--------------------|-----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| excluded_sym_names | list of strings | A list of strings representing the names of the symbols that users want to excluding from being quantized.| -| calib_mode | str | If calib_mode='none', no calibration will be used and the thresholds for requantization after the corresponding layers will be calculated at runtime by calling min and max operators. The quantized models generated in this mode are normally 10-20% slower than those with calibrations during inference.
If calib_mode='naive', the min and max values of the layer outputs from a calibration dataset will be directly taken as the thresholds for quantization.
If calib_mode='entropy', the thresholds for quantization will be derived such that the KL divergence between the distributions of FP32 layer outputs and quantized layer outputs is minimized based upon the calibration dataset. | -| calib_layer | function | Given a layer's output name in string, return True or False for deciding whether to calibrate this layer.
If yes, the statistics of the layer's output will be collected; otherwise, no information of the layer's output will be collected.
If not provided, all the layers' outputs that need requantization will be collected.| -| quantized_dtype | str | The quantized destination type for input data. Currently support 'int8', 'uint8' and 'auto'.
'auto' means automatically select output type according to calibration result.| - -### Evaluate & Tune - -Now, you get a pair of quantized symbol and params file for inference. For Gluon inference, only difference is to load model and params by a SymbolBlock as below example: - -```python -quantized_net = mx.gluon.SymbolBlock.imports('quantized-resnet18_v1-symbol.json', 'data', 'quantized-resnet18_v1-0000.params') -quantized_net.hybridize(static_shape=True, static_alloc=True) -batch_size = 1 -data = mx.nd.ones((batch_size,3,224,224)) -quantized_net(data) -``` - -Now, you can get the accuracy from a quantized network. Furthermore, you can try to select different layers or OPs to be quantized by `excluded_sym_names` parameter and figure out an acceptable accuracy. - -### Calibrate Model (optional for performance) - -The quantized model generated in previous steps can be very slow during inference since it will calculate min and max at runtime. We recommend using offline calibration for better performance by setting `calib_mode` to `naive` or `entropy`. And then calling `set_monitor_callback` api to collect layer information with a subset of the validation datasets before int8 inference. - -```python -# quantization configs -# set exclude layers -excluded_names = [] -# set calib mode. -calib_mode = 'naive' -# set calib_layer -calib_layer = None -# set quantized_dtype -quantized_dtype = 'auto' -logger.info('Quantizing FP32 model resnet18-V1') -cqsym, cqarg_params, aux_params, collector = quantize_graph(sym=sym, arg_params=arg_params, aux_params=aux_params, - excluded_sym_names=excluded_names, - calib_mode=calib_mode, calib_layer=calib_layer, - quantized_dtype=quantized_dtype, logger=logger) - -# download imagenet validation dataset -mx.test_utils.download('https://data.mxnet.io/data/val_256_q90.rec', 'dataset.rec') -# set rgb info for data -mean_std = {'mean_r': 123.68, 'mean_g': 116.779, 'mean_b': 103.939, 'std_r': 58.393, 'std_g': 57.12, 'std_b': 57.375} -# set batch size -batch_size = 16 -# create DataIter -data = mx.io.ImageRecordIter(path_imgrec='dataset.rec', batch_size=batch_size, data_shape=batch_shape[1:], rand_crop=False, rand_mirror=False, **mean_std) -# create module -mod = mx.mod.Module(symbol=sym, label_names=None, context=mx.cpu()) -mod.bind(for_training=False, data_shapes=data.provide_data, label_shapes=None) -mod.set_params(arg_params, aux_params) - -# calibration configs -# set num_calib_batches -num_calib_batches = 5 -max_num_examples = num_calib_batches * batch_size -# monitor FP32 Inference -mod._exec_group.execs[0].set_monitor_callback(collector.collect, monitor_all=True) -num_batches = 0 -num_examples = 0 -for batch in data: - mod.forward(data_batch=batch, is_train=False) - num_batches += 1 - num_examples += batch_size - if num_examples >= max_num_examples: - break -if logger is not None: - logger.info("Collected statistics from %d batches with batch_size=%d" - % (num_batches, batch_size)) -``` - -After that, layer information will be filled into the `collector` returned by `quantize_graph` api. Then, you need to write the layer information into int8 model by calling `calib_graph` api. 
- - -```python -# write scaling factor into quantized symbol -cqsym, cqarg_params, aux_params = calib_graph(qsym=cqsym, arg_params=arg_params, aux_params=aux_params, - collector=collector, calib_mode=calib_mode, - quantized_dtype=quantized_dtype, logger=logger) -# (optional) visualize quantized model -mx.viz.plot_network(cqsym) -``` - -Below is a quantized residual block with naive calibration. We can see `min_calib_range` and `max_calib_range` are written into `_contrib_requantize` operators. - -![naive calibrated model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/naive_calib.png) - -When you get a quantized model with calibration, keeping sure to call fusion api again since this can fuse some `requantize` or `dequantize` operators for further performance improvement. - -```python -# perform post-quantization fusion -cqsym = cqsym.get_backend_symbol('MKLDNN_QUANTIZE') -# (optional) visualize post-quantized model -mx.viz.plot_network(cqsym) -# save quantized model -mx.model.save_checkpoint('quantized-resnet18_v1', 0, cqsym, cqarg_params, aux_params) -``` - -Below is a post-quantized residual block. We can see `_contrib_requantize` operators are fused into `Convolution` operators. - -![post-quantized model](https://github.com/dmlc/web-data/raw/master/mxnet/tutorials/mkldnn/quantization/post_quantize.png) - -BTW, You can also modify the `min_calib_range` and `max_calib_range` in the JSON file directly. - -``` - { - "op": "_sg_mkldnn_conv", - "name": "quantized_sg_mkldnn_conv_bn_act_6", - "attrs": { - "max_calib_range": "3.562147", - "min_calib_range": "0.000000", - "quantized": "true", - "with_act": "true", - "with_bn": "true" - }, -...... -``` - -### Tips for Model Calibration - -#### Accuracy Tuning - -- Try to use `entropy` calib mode; - -- Try to exclude some layers which may cause obvious accuracy drop; - -- Change calibration dataset by setting different `num_calib_batches` or shuffle your validation dataset; - -#### Performance Tuning - -- Keep sure to perform graph fusion before quantization; - -- If lots of `requantize` layers exist, keep sure to perform post-quantization fusion after calibration; - -- Compare the MXNet profile or `MKLDNN_VERBOSE` of float32 and int8 inference; - -## Deploy with Python/C++ - -MXNet also supports deploy quantized models with C++. Refer [MXNet C++ Package](https://github.com/apache/incubator-mxnet/blob/master/cpp-package/README.md) for more details. - - diff --git a/docs/static_site/src/pages/api/faq/gradient_compression.md b/docs/static_site/src/pages/api/faq/gradient_compression.md index e2b47c646ada..b6e318a4f8b5 100644 --- a/docs/static_site/src/pages/api/faq/gradient_compression.md +++ b/docs/static_site/src/pages/api/faq/gradient_compression.md @@ -104,14 +104,6 @@ trainer = gluon.Trainer(..., compression_params={'type’:'2bit', 'threshold':0. ``` A reference `gluon` implementation with a gradient compression option can be found in the [train.py script from a word-level language modeling RNN example](https://github.com/apache/incubator-mxnet/blob/master/example/gluon/word_language_model/train.py). -**Module API**: - -```python -mod = mx.mod.Module(..., compression_params={'type’:'2bit', 'threshold':0.5}) -``` - -A `module` example is provided with [this guide for setting up MXNet with distributed training](/api/faq/distributed_training). 
It comes with the option of turning on gradient compression as an argument to the [train_mnist.py script](https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/train_mnist.py). - ### Configuration Details **Threshold** diff --git a/docs/static_site/src/pages/api/faq/multi_devices.md b/docs/static_site/src/pages/api/faq/multi_devices.md deleted file mode 100644 index d8bc81cb1106..000000000000 --- a/docs/static_site/src/pages/api/faq/multi_devices.md +++ /dev/null @@ -1,217 +0,0 @@ ---- -layout: page_category -title: Data Parallelism with Multiple CPU/GPUs on MXNet -category: faq -faq_c: Model -question: How do I run MXNet on Multiple CPU/GPUs with data parallelism? -permalink: /api/faq/multi_device ---- - - - - - - - - - - - - - - - - - -# Run MXNet on Multiple CPU/GPUs with Data Parallelism - -_MXNet_ supports training with multiple CPUs and GPUs, which may be located on different physical machines. - -## Data Parallelism vs Model Parallelism - -By default, _MXNet_ uses data parallelism to partition the workload over multiple -devices. -Assume there are *n* devices. -Then each one will receive a copy of the complete model -and train it on *1/n* of the data. -The results such as gradients and -updated model are communicated across these devices. - -MXNet also supports model parallelism. -In this approach, each device holds onto only part of the model. -This proves useful when the model is too large to fit onto a single device. -As an example, see the following [tutorial](./model_parallel_lstm.md) -which shows how to use model parallelism for training a multi-layer LSTM model. -In this tutorial, we'll focus on data parallelism. - -## Multiple GPUs within a Single Machine - -### Workload Partitioning - -By default, _MXNet_ partitions a data batch evenly among the available GPUs. -Assume a batch size *b* and assume there are *k* GPUs, then in one iteration -each GPU will perform forward and backward on *b/k* examples. The -gradients are then summed over all GPUs before updating the model. - -### How to Use - -> To use GPUs, we need to compile MXNet with GPU support. For -> example, set `USE_CUDA=1` in `config.mk` before `make`. (see -> [MXNet installation guide](/get_started) for more options). - -If a machine has one or more GPU cards installed, -then each card is labeled by a number starting from 0. -To use a particular GPU, one can either -specify the context `context` in code -or pass `--gpus` at the command line. -For example, to use GPU 0 and 2 in python, -one can typically create a module with -```python -import mxnet as mx -module = mx.module.Module(context=[mx.gpu(0), mx.gpu(2)], ...) -``` -while if the program accepts a `--gpus` flag (as seen in -[example/image-classification](https://github.com/dmlc/mxnet/tree/master/example/image-classification)), -then we can try -```bash -python train_mnist.py --gpus 0,2 ... -``` - -### Advanced Usage -If the available GPUs are not all equally powerful, -we can partition the workload accordingly. -For example, if GPU 0 is 3 times faster than GPU 2, -then we might use the workload option `work_load_list=[3, 1]`, -see [Module](/api/python/docs/api/module/index.html) -for more details. - -Training with multiple GPUs should yield the same results -as training on a single GPU if all other hyper-parameters are the same. -In practice, the results may exhibit small differences, -owing to the randomness of I/O (random order or other augmentations), -weight initialization with different seeds, and CUDNN. 
- -We can control on which devices the gradient is aggregated -and on which device the model is updated via [`KVStore`](/api/python/docs/api/kvstore/index.html), -the _MXNet_ module that supports data communication. -One can either use `mx.kvstore.create(type)` to get an instance -or use the program flag `--kv-store type`. - -There are two commonly used types, - -- `local`: all gradients are copied to CPU memory and weights are updated there. -- `device`: both gradient aggregation and weight updates are run on GPUs. -With this setting, the `KVStore` also attempts to use GPU peer-to-peer communication, -potentially accelerating the communication. -Note that this option may result in higher GPU memory usage. - -When using a large number of GPUs, e.g. >=4, we suggest using `device` for better performance. - -## Distributed Training with Multiple Machines - -`KVStore` also supports a number of options for running on multiple machines. - -- `dist_sync` behaves similarly to `local` but exhibits one major difference. - With `dist_sync`, `batch-size` now means the batch size used on each machine. - So if there are *n* machines and we use batch size *b*, - then `dist_sync` behaves like `local` with batch size *n\*b*. -- `dist_device_sync` is similar to `dist_sync`. The difference between them is that - `dist_device_sync` aggregates gradients and updates weight on GPUs - while `dist_sync` does so on CPU memory. -- `dist_async` performs asynchronous updates. - The weight is updated whenever gradients are received from any machine. - The update is atomic, i.e., no two updates happen on the same weight at the same time. - However, the order is not guaranteed. - -### How to Launch a Job - -> To use distributed training, we need to compile with `USE_DIST_KVSTORE=1` -> (see [MXNet installation guide](/get_started) for more options). - -Launching a distributed job is a bit different from running on a single -machine. MXNet provides -[tools/launch.py](https://github.com/dmlc/mxnet/blob/master/tools/launch.py) to -start a job by using `ssh`, `mpi`, `sge`, or `yarn`. - -An easy way to set up a cluster of EC2 instances for distributed deep learning -is using an [AWS CloudFormation template](https://github.com/awslabs/deeplearning-cfn). -If you do not have a cluster, you can check the repository before you continue. - -Assume we are at the directory `mxnet/example/image-classification` -and want to train LeNet to classify MNIST images, as demonstrated here: -[train_mnist.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/train_mnist.py). - -On a single machine, we can run: - -```bash -python train_mnist.py --network lenet -``` - -Now, say we are given two ssh-able machines and _MXNet_ is installed on both machines. -We want to train LeNet on these two machines. -First, we save the IPs (or hostname) of these two machines in file `hosts`, e.g. - -```bash -$ cat hosts -172.30.0.172 -172.30.0.171 -``` - -Next, if the mxnet folder is accessible from both machines, e.g. on a -[network filesystem](https://help.ubuntu.com/lts/serverguide/network-file-system.html), -then we can run: - -```bash -python ../../tools/launch.py -n 2 --launcher ssh -H hosts python train_mnist.py --network lenet --kv-store dist_sync -``` - -Note that here we - -- use `launch.py` to submit the job. -- provide launcher, `ssh` if all machines are ssh-able, `mpi` if `mpirun` is - available, `sge` for Sun Grid Engine, and `yarn` for Apache Yarn. 
-- `-n` number of worker nodes to run on -- `-H` the host file which is required by `ssh` and `mpi` -- `--kv-store` use either `dist_sync` or `dist_async` - - -### Synchronize Directory - -Now consider if the mxnet folder is not accessible. -We can first copy the `MXNet` library to this folder by -```bash -cp -r ../../python/mxnet . -cp -r ../../lib/libmxnet.so mxnet -``` - -then ask `launch.py` to synchronize the current directory to all machines' - `/tmp/mxnet` directory with `--sync-dst-dir` - -```bash -python ../../tools/launch.py -n 2 -H hosts --sync-dst-dir /tmp/mxnet \ - python train_mnist.py --network lenet --kv-store dist_sync -``` - -### Use a Particular Network Interface - -_MXNet_ often chooses the first available network interface. -But for machines that have multiple interfaces, -we can specify which network interface to use for data -communication by the environment variable `DMLC_INTERFACE`. -For example, to use the interface `eth0`, we can - -``` -export DMLC_INTERFACE=eth0; python ../../tools/launch.py ... -``` - -### Debug Connection - -Set`PS_VERBOSE=1` to see the debug logging, e.g -``` -export PS_VERBOSE=1; python ../../tools/launch.py ... -``` - -### More - -- See more launch options by `python ../../tools/launch.py -h` -- See more options of [ps-lite](http://ps-lite.readthedocs.org/en/latest/how_to.html) \ No newline at end of file diff --git a/example/image-classification/README.md b/example/image-classification/README.md deleted file mode 100644 index 4b4a48b33ae4..000000000000 --- a/example/image-classification/README.md +++ /dev/null @@ -1,387 +0,0 @@ -# Image Classification - -This fold contains examples for image classification. The goal of image -classifcation is to identify the objects contained in images. The following -[example](https://mxnet.io/tutorials/python/predict_image.html) shows -recognized object classes with corresponding probabilities using a pre-trained -model. - - - -## Contents - -1. [Basic usages](#basic-usages) -2. [How to prepare datasets](#prepare-datasets) -3. [A List of pre-trained models](#pre-trained-models) -4. [How to fine-tune a dataset with a pre-trained model](#fine-tune-another-dataset) -5. [How to train with multiple machines](#distributed-training) -6. [Frequently asked questions](#faq) - -## Basic Usages - -Both python and R training programs are provided. Use `train_*.py` or -`train_*.R` to train a network on a particular dataset. For example: - -- train a multilayer perception on the mnist dataset - - ```bash - python train_mnist.py --network mlp - ``` - -- train a 110-layer resnet on the cifar10 dataset with batch size 128 and GPU 0 and 1 - - ```bash - python train_cifar10.py --network resnet --num-layers 110 --batch-size 128 --gpus 0,1 - ``` - -There is a rich set of options, one can list them by passing `--help`. Some -commonly used options are listed as following: - -| Argument | Comments | -| ----------------------------- | ---------------------------------------- | -| `network`                     | The network to train, which is defined in [symbol/](https://github.com/dmlc/mxnet/tree/master/example/image-classification/symbols). Some networks may accept additional arguments, such as `--num-layers` is used to specify the number of layers in ResNet. | -| `data-train`, `data-val` | The data for training and validation. It can be either a filename or a directory. For the latter, all files in the directory will be used. But if `--benchmark 1` is used, then there two arguments will be ignored. 
| -| `gpus` | The list of GPUs to use, such as `0` or `0,3,4,7`. If an empty string `''` is given, then we will use CPU. | -| `batch-size` | The batch size for SGD training. It specifies the number of examples used for each SGD iteration. If we use *k* GPUs, then each GPU will compute *batch_size/k* examples in each time. | -| `model` | The model name to save (and load). A model will be saved into two parts: `model-symbol.json` for the network definition and `model-n.params` for the parameters saved on epoch *n*. | -| `num-epochs` | The maximal number of epochs to train. | -| `load-epoch` | If given integer *k*, then resume the training starting from epoch *k* with the model saved at the end of epoch *k-1*. Note that the training starts from epoch 0, and the model saved at the end of this epoch will be `model-0001.params`. | -| `lr` | The initial learning rate, namely for epoch 0. | -| `lr-factor`, `lr-step-epochs` | Reduce the learning rate on give epochs. For example, `--lr-factor .1 --lr-step-epochs 30,60` will reduce the learning rate by 0.1 on epoch 30, and then reduce it by 0.1 again on epoch 60. | - -## Prepare Datasets - -The recommended data format is -[RecordIO](https://mxnet.io/architecture/note_data_loading.html), which -concatenates multiple examples into seekable binary files for better read -efficiency. We provide a tool `im2rec.py` located in `tools/` to convert -individual images into `.rec` files. - -For a simple tutorial, assume all images are stored as individual image files -such as `.png` or `.jpg`, and images belonging to the same class are placed in -the same directory. All these class directories are then in the same root -`img_data` directory. Our goal is to generate two files, `mydata_train.rec` for -training and `mydata_val.rec` for validation, and the former contains 95% -images. - -We first prepare two `.lst` files, which consist of the labels and image paths -can be used for generating `rec` files. - -```bash -python tools/im2rec.py --list --recursive --train-ratio 0.95 mydata img_data -``` - -Then we generate the `.rec` files. We resize the images such that the short edge -is at least 480px and save them with 95/100 quality. We also use 16 threads to -accelerate the packing. - -```bash -python tools/im2rec.py --resize 480 --quality 95 --num-thread 16 mydata img_data -``` - -Hints: - -- SSD is much faster than HDD when dealing with a large number of small - files. (but HDD is good enough to read `rec` files). - - We can use a cloud storage instance to prepare the data. For example, AWS - `i2.4xlarge` provides 4 x 800 GB SSDs. - - We can make a software RAID over multiple disks. For example, the following - command create a RAID0 on 4 disks: - - ```bash - sudo mdadm --create --verbose /dev/md0 --level=stripe --raid-devices=4 \ - /dev/nvme0n1 /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 - sudo mkfs /dev/md0 - ``` -- Check `*.sh` scripts in the `data/` folder for more examples -- Use `im2rec.py --help` to see more options. - -## Pre-trained Models - -We provide multiple pre-trained models on various datasets. Use -[common/modelzone.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/common/modelzoo.py) -to download these models. These models can be used in any front-end language -MXNet supports. For example, -[the tutorial](https://mxnet.io/tutorials/python/predict_image.html) shows how -to classify an image with jupyter notebook. 
- -### ImageNet 1K - -It is first used by -[ImageNet challenge 2012](http://www.image-net.org/challenges/LSVRC/2012/), -which contains about 1.2M images with 1000 classes. To test these models, one -can use -[data/imagenet1k-val.sh](https://github.com/dmlc/mxnet/blob/master/example/image-classification/data/imagenet1k-val.sh) -to prepare the validation dataset and -[score.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/score.py) -to calculate the accuracy. - -#### Single Center Crop Accuracy - -| Model | Top-1 | Top-5 | -| ------------------------------ | ------ | ------ | -| `imagenet1k-inception-bn` | 0.7245 | 0.9079 | -| `imagenet1k-resnet-18` | 0.6858 | 0.8866 | -| `imagenet1k-resnet-34` | 0.7244 | 0.9097 | -| `imagenet1k-resnet-50` | 0.7527 | 0.9258 | -| `imagenet1k-resnet-101` | 0.7684 | 0.9327 | -| `imagenet1k-resnet-152` | 0.7653 | 0.9312 | -| `imagenet1k-resnext-50` | 0.7689 | 0.9332 | -| `imagenet1k-resnext-101` | 0.7828 | 0.9408 | -| `imagenet1k-resnext-101-64x4d` | 0.7911 | 0.9430 | - -Note: -- our Resnet does not need to specify the RGB mean due the data batch - normalization layer. While the inception models needs `--rgb-mean - 123.68,116.779,103.939` -- Resnet training logs are available at - [tornadomeet/ResNet](https://github.com/tornadomeet/ResNet/tree/master/log) -- We warm up our Resnext-101-64x4d by training it with 1/100 and 1/10 of the - base learning rate for the 1st and 2nd epoch. We use 3 p2.16xlarge instances - with a batch size of 384 on each node with base lr set to 0.45, and decay step - set at 50, 80, 110 epoch. After 133 epoch, we use one node to finetune, and - turn off color and scale data augmentation, with lr reduced to 1.5e-04. - -#### Speed and Memory Footprint: - -Single K80 GPU with batch size 32. - -| Model | memory (MB) | images/sec | -| ------------------------- | ----------- | ---------- | -| `imagenet1k-inception-bn` | 548 | 152 | -| `imagenet1k-resnet-18` | 637 | 185 | -| `imagenet1k-resnet-34` | 678 | 172 | -| `imagenet1k-resnet-50` | 763 | 109 | -| `imagenet1k-resnet-101` | 835 | 78 | -| `imagenet1k-resnet-152` | 897 | 57 | - -### Imagenet 11K - -It is generated from the complete Imagenet dataset, namely  `fall11_whole.tar` -from -[http://www.image-net.org/download-images](http://www.image-net.org/download-images). In -addition, we removed classes which have less than 500 images, and then randomly -picked 50 images from each class as the validation set. As a result, this -dataset contains 11221 classes, with 11,797,630 images for training. - -### Single Center Crop Accuracy - -| Model | Top-1 | -| ------------------------ | ------ | -| `imagenet11k-resnet-152` | 0.4157 | - -### Imagenet 11K + Place365 Challenge - -This dataset combine the Imagenet 11K dataset with -[the Place 365 challenge dataset](http://places2.csail.mit.edu/download.html). The -latter contains 365 classes with 8 millions images. It results in a dataset with -around 20 million images. - -### Single Center Crop Accuracy - -| Model | Top-1 | -| ----------------------------------- | ------ | -| `imagenet11k-place365ch-resnet-50` | 0.3112 | -| `imagenet11k-place365ch-resnet-152` | 0.3355 | - - -## Fine-tune another Dataset - -Fine-tune refers training with parameters partially intialized with pre-trained -model. One can use -[fine-tune.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/fine-tune.py) -to train another dataset with pre-trained models listed above. 
For example, -first run -[data/caltech256.sh](https://github.com/dmlc/mxnet/blob/master/example/image-classification/data/caltech256.sh) -to download and prepare the -[Caltech-256](http://www.vision.caltech.edu/Image_Datasets/Caltech256/) dataset, -then fine tune it with `imagenet11k-resnet-152` by using 8 GPUs: - -```bash -python fine-tune.py --pretrained-model imagenet11k-resnet-152 --gpus 0,1,2,3,4,5,6,7 \ - --data-train data/caltech256-train.rec --data-val data/caltech256-val.rec \ - --batch-size 128 --num-classes 256 --num-examples 15240 -``` - -We obtained 87.3% top-1 validation accuracy, and the training log is available -[here](https://gist.github.com/mli/900b810258e2e0bc26fa606977a3b043#file-finetune-caltech265). See -the [python notebook](https://mxnet.io/faq/finetune.html) for more -explanations. - -## Distributed Training - -The simplest way for distributing training is that both programs and data are -placed on the a shared filesystem such as -[NFS](https://en.wikipedia.org/wiki/Network_File_System) and -[AWS EFS](https://aws.amazon.com/efs/), and there is one machine, we call it the -root machine, can ssh to all others. Assume we save the hostnames (or IPs) of -all machines will be used for training (might include the root machine) into a -file named `hosts`. The outputs of `cat hosts` may be - -```bash -172.30.0.172 -172.30.0.171 -``` - -Now we can run the previous cifar10 training on two machines: - -```bash -python ../../tools/launch.py -n 2 -H hosts \ - python train_cifar10.py --network resnet --num-layers 110 --batch-size 128 --gpus 0,1 \ - --kv-store dist_device_sync -``` - -It differs the previous command in two aspects. First, we use `launch.py` to -start the program, which creates two workers (given by `-n`) on the two machines -specified in `hosts` . Second, we change the `--kv-store` from the default -`device`, which means try to use GPU P2P, to `dist_device_sync`. The latter uses -distributed synchronized communication. - -For more usages: - -- One can use - [benchmark.py](https://github.com/dmlc/mxnet/blob/master/example/image-classification/benchmark.py) - to run distributed benchmarks (also for multiple GPUs with single machine) -- A how-to [tutorial](https://mxnet.io/api/faq/distributed_training.html) with more - explanation. -- A - [blog](https://aws.amazon.com/blogs/compute/distributed-deep-learning-made-easy/) - about setuping up a GPU cluster on AWS with cloud formation. - -### Benchmark - -To run benchmark on imagenet networks, use `--benchmark 1` as the argument to `train_imagenet.py`, An example is shown below: - -```bash -python train_imagenet.py --benchmark 1 --gpus 0,1 --network inception-v3 --batch-size 64 \ - --image-shape 3,299,299 --num-epochs 1 --kv-store device -``` - -When running in benchmark mode, the script generates synthetic data of the given data shape and batch size. - -The `benchmark.py` can be used to run a series of benchmarks against different image networks on a given set of workers and takes the following arguments: -- `--worker_file`: file that contains a list of worker hostnames or list of worker ip addresses that have passwordless ssh enabled. -- `--worker_count`: number of workers to run benchmark on. -- `--gpu_count`: number of gpus on each worker to use. -- `--networks`: one or more networks in the format mode:network_name:batch_size:image_size. (Use `native` mode for imagenet benchmarks and any of the symbolic/imperative/hybrid for gluon benchmarks). Be sure to use appropriate models according to the mode you are using. 
- -The `benchmark.py` script runs benchmarks on variable number of gpus upto gpu_count starting from 1 gpu doubling the number of gpus in each run using `kv-store=device` and after that running on variable number of nodes on all gpus starting with 1 node upto `worker_count` doubling the number of nodes used in each run using `kv-store=dist_sync_device`. - -An example to run the benchmark script is shown below with 8 workers and 16 gpus on each worker: -``` -python benchmark.py --worker_file /opt/deeplearning/workers --worker_count 8 \ - --gpu_count 16 --networks 'native:inception-v3:32:299' -``` - -Additionally, this script also runs [Gluon vision models](mxnet/python/mxnet/gluon/model_zoo/model_store.py) benchmarking [image_classification](mxnet/example/gluon/image_classification.py) script -for all three symbolic, imperative and hybrid paradigms using synthetic data. -An example to run the benchmark script is shown below with 8 workers and 16 gpus on each worker: -``` -python benchmark.py --worker_file /opt/deeplearning/workers --worker_count 8 \ - --gpu_count 16 --networks 'imperative:resnet152_v1:32:299' -``` - -To run benchmark on gluon vision models, use `--benchmark 1` as the argument to `image_classification.py`, An example is shown below: -``` -python ../gluon/image_classification.py --dataset dummy --gpus 2 --epochs 1 --benchmark --mode imperative \ - --model resnet152_v1 --batch-size 32 --log-interval 1 --kv-store dist_sync_device -``` - -### Scalability Results - -- Hardware: 16x AWS [P2.16xlarge](https://aws.amazon.com/ec2/instance-types/p2/) -with 256 GPUs in total. -- Software: - [AWS Deep Learning AMI](https://aws.amazon.com/marketplace/pp/B01M0AXXQB) with - CUDA 7.5 and CUDNN 5.1 installed - -We fixed the batch size per GPU and then increase the number of -GPUs. Synchronized SGD is used, namely `--kv-store dist_device_sync`. The -following three CNNs (located in [symbol/](./symbol/)) are used - -| | `alexnet` | `inception-v3` | `resnet-152` | -| --- | --- | --- | --- | -| batch per GPU | 512 | 32 | 32 | -| model size (MB) | 203 | 95 | 240 | - -Number of images proccessed per second is shown in the following table: - -| #GPUs | `alexnet` | `inception-v3` | `resnet-152` | -| --- | --- | --- | --- | -| 1 | 457.07 | 30.4 | 20.08 | -| 2 | 870.43 | 59.61 | 38.76 | -| 4 | 1514.8 | 117.9 | 77.01 | -| 8 | 2852.5 | 233.39 | 153.07 | -| 16 | 4244.18 | 447.61 | 298.03 | -| 32 | 7945.57 | 882.57 | 595.53 | -| 64 | 15840.52 | 1761.24 | 1179.86 | -| 128 | 31334.88 | 3416.2 | 2333.47 | -| 256 | 61938.36 | 6660.98 | 4630.42 | - -The following figure shows the speedup against a single GPU compared to the ideal scalability. - - - -### Convergence - -We show the convergence of training ResNet-152 on Imagenet 1K. The single machine with 8 GPUs results are from [Wei Wu](https://github.com/tornadomeet/ResNet/tree/master/log). We then trained the model using 10 machines, each machine has 8 GPUs, with the same hyper-parameters, except for we changed the total batch size from 8\*32 to 80\*32 and the initial learning rate to 0.5 instead of 0.1. The validation accuracy versus data epoch is shown as following. Both models have almost identical convergence rate. - - - -## FAQ - -### Validation Accuracy - -It is often straightforward to achieve a reasonable validation accuracy, but -sometimes matching the state-of-the-art numbers reported in the papers is -extremely hard. 
Here we list some aspects you can check to improve the validation accuracy:
-
-- Add more data augmentation, which often reduces the gap between training
-  accuracy and validation accuracy. You may reduce the amount of augmentation
-  toward the end of training.
-- Increase the learning rate and keep it large for a long time. For
-  example, on CIFAR10 we keep `lr=0.1` for 200 epochs and then reduce it to 0.01.
-- Do not use a batch size that is too large, especially when the batch size is
-  much larger than the number of classes.
-
-### Speed
-
-First check that the workload is not too small (e.g. LeNet on MNIST) and that the batch
-size is reasonably large. The performance bottleneck often happens in three
-aspects:
-
-- Reading data. Use the `--test-io 1` flag to check how many images can be pre-processed per second.
-  - Increasing `--data-nthreads` (default is 4) to use more threads for data augmentation can help.
-  - Data preprocessing is done by `opencv`. If opencv is compiled from source,
-    check that it is configured correctly.
-  - Use `--benchmark 1` to use randomly generated data rather than real data.
-
-Refer to [faq/performance](https://mxnet.io/api/faq/perf) for more details
-about CPU, GPU and multi-device performance.
-
-### Memory
-
-An oversized batch size may exhaust GPU memory. The common error
-message is `cudaMalloc failed: out of memory`. In that case you can
-
-- Reduce the batch size.
-- Set the environment variable `MXNET_MEMORY_OPT=1` to perform a series of
-  memory optimizations (e.g., trading computation for memory consumption).
-  For example, with batch size 64, inception-v3 uses 10 GB of memory and trains at 30
-  images/sec on a single K80 GPU. When mirroring is enabled, with the same 10 GB of GPU
-  memory consumption, we can run inception-v3 with batch size 128, at the cost of the
-  speed dropping to 27 images/sec.
-
-## History
-
-- Nov 9, 2015: major refactor.
-  - Organize files into sub-directories
-  - Add [Resnet](https://github.com/tornadomeet/ResNet), pretrained models, and fine-tune scripts.
-  - Update documents.
-  - Move `../cpp/image-classification` into `./predict-cpp/`
-- Oct 15, 2016: add R examples
-- Nov 19, 2015: major refactor.
-  - Various networks (Alex/VGG/Inception) on multiple datasets
-    (MNIST/Cifar10/Imagenet)
-  - Distributed training
diff --git a/example/image-classification/__init__.py b/example/image-classification/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/example/image-classification/benchmark.py b/example/image-classification/benchmark.py
deleted file mode 100644
index 2a50d506adb6..000000000000
--- a/example/image-classification/benchmark.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
- -from __future__ import print_function -import logging -import argparse -import os -import time -import sys -import shutil -import csv -import re -import subprocess, threading -import pygal -import importlib -import collections -import threading -import copy -''' -Setup Logger and LogLevel -''' -def setup_logging(log_loc): - if os.path.exists(log_loc): - shutil.move(log_loc, log_loc + "_" + str(int(os.path.getctime(log_loc)))) - os.makedirs(log_loc) - - log_file = '{}/benchmark.log'.format(log_loc) - LOGGER = logging.getLogger('benchmark') - LOGGER.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s %(levelname)s:%(name)s %(message)s') - file_handler = logging.FileHandler(log_file) - console_handler = logging.StreamHandler() - file_handler.setFormatter(formatter) - console_handler.setFormatter(formatter) - - LOGGER.addHandler(file_handler) - LOGGER.addHandler(console_handler) - return LOGGER - -''' -Runs the command given in the cmd_args for specified timeout period -and terminates after -''' -class RunCmd(threading.Thread): - def __init__(self, cmd_args, logfile): - threading.Thread.__init__(self) - self.cmd_args = cmd_args - self.logfile = logfile - self.process = None - - def run(self): - LOGGER = logging.getLogger('benchmark') - LOGGER.info('started running %s', ' '.join(self.cmd_args)) - log_fd = open(self.logfile, 'w') - self.process = subprocess.Popen(self.cmd_args, stdout=log_fd, stderr=subprocess.STDOUT, universal_newlines=True) - for line in self.process.communicate(): - LOGGER.debug(line) - log_fd.close() - LOGGER.info('finished running %s', ' '.join(self.cmd_args)) - - def startCmd(self, timeout): - LOGGER.debug('Attempting to start Thread to run %s', ' '.join(self.cmd_args)) - self.start() - self.join(timeout) - if self.is_alive(): - LOGGER.debug('Terminating process running %s', ' '.join(self.cmd_args)) - self.process.terminate() - self.join() - time.sleep(1) - return - -log_loc = './benchmark' -LOGGER = setup_logging(log_loc) - -class Network(object): - def __init__(self, mode, name, img_size, batch_size): - self.mode = mode - self.name = name - self.img_size = img_size - self.batch_size = batch_size - self.gpu_speedup = collections.OrderedDict() - -def parse_args(): - class NetworkArgumentAction(argparse.Action): - def validate(self, attrs): - args = attrs.split(':') - if len(args) != 4 or isinstance(args[0], str) == False or isinstance(args[1], str) == False: - print('expected network attributes in format mode:network_name:batch_size:image_size \ - \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder. \ - \nOr a gluon vision model defined in mxnet/python/mxnet/gluon/model_zoo/model_store.py.') - sys.exit(1) - try: - # check if the network exists - if args[0] == 'native': - importlib.import_module('symbols.' + args[1]) - batch_size = int(args[2]) - img_size = int(args[3]) - return Network(mode=args[0], name=args[1], batch_size=batch_size, img_size=img_size) - except Exception as e: - print('expected network attributes in format mode:network_name:batch_size:image_size \ - \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder. 
\ - \nOr a gluon vision model defined in mxnet/python/mxnet/gluon/model_zoo/model_store.py.') - print(e) - sys.exit(1) - - def __init__(self, *args, **kw): - kw['nargs'] = '+' - argparse.Action.__init__(self, *args, **kw) - - def __call__(self, parser, namespace, values, option_string=None): - if isinstance(values, list) == True: - setattr(namespace, self.dest, map(self.validate, values)) - else: - setattr(namespace, self.dest, self.validate(values)) - - parser = argparse.ArgumentParser(description='Run Benchmark on various imagenet networks using train_imagenent.py') - parser.add_argument('--networks', dest='networks', nargs='+', type=str, help='one or more networks in the format mode:network_name:batch_size:image_size \ - \nThe network_name is a valid model defined as network_name.py in the image-classification/symbol folder for native imagenet \ - \n Or a gluon vision model defined in mxnet/python/mxnet/gluon/model_zoo/model_store.py.', - action=NetworkArgumentAction) - parser.add_argument('--worker_file', type=str, - help='file that contains a list of worker hostnames or list of worker ip addresses that can be sshed without a password.', - required=True) - parser.add_argument('--worker_count', type=int, help='number of workers to run benchmark on.', required=True) - parser.add_argument('--gpu_count', type=int, help='number of gpus on each worker to use.', required=True) - args = parser.parse_args() - return args - -def series(max_count): - i = 1 - s = [] - while i <= max_count: - s.append(i) - i = i * 2 - if s[-1] < max_count: - s.append(max_count) - return s - -''' -Choose the middle iteration to get the images processed per sec -''' -def images_processed(log_loc, mode): - f = open(log_loc) - if mode == 'native': - img_per_sec = re.findall("(?:Batch\s+\[30\]\\\\tSpeed:\s+)(\d+\.\d+)(?:\s+)", str(f.readlines())) - else: - img_per_sec = re.findall("(?:Batch\s+\[3\]\\\\tSpeed:\s+)(\d+\.\d+)(?:\s+)", str(f.readlines())) - f.close() - img_per_sec = map(float, img_per_sec) - total_img_per_sec = sum(img_per_sec) - return total_img_per_sec - -def generate_hosts_file(num_nodes, workers_file, args_workers_file): - f = open(workers_file, 'w') - output = subprocess.check_output(['head', '-n', str(num_nodes), args_workers_file]) - f.write(output) - f.close() - return - -def stop_old_processes(hosts_file, prog_name): - stop_args = ['python', '../../tools/kill-mxnet.py', hosts_file, 'python', prog_name] - stop_args_str = ' '.join(stop_args) - LOGGER.info('killing old remote processes\n %s', stop_args_str) - stop = subprocess.check_output(stop_args, stderr=subprocess.STDOUT) - LOGGER.debug(stop) - time.sleep(1) - -def run_benchmark(kv_store, data_shape, batch_size, num_gpus, num_nodes, network, args_workers_file, mode): - if mode == 'native': - benchmark_args = ['python', 'train_imagenet.py', '--gpus', ','.join(str(i) for i in range(num_gpus)), \ - '--network', network, '--batch-size', str(batch_size * num_gpus), \ - '--image-shape', '3,' + str(data_shape) + ',' + str(data_shape), '--num-epochs', '1', - '--kv-store', kv_store, '--benchmark', '1', '--disp-batches', '10'] - else: - benchmark_args = ['python', '../gluon/image_classification.py', '--dataset', 'dummy', '--gpus', str(num_gpus), \ - '--epochs', '1', '--benchmark', '--mode', mode, '--model', network, '--batch-size', - str(batch_size), \ - '--log-interval', str(1), '--kvstore', kv_store] - - log = log_loc + '/' + network + '_' + str(num_nodes * num_gpus) + '_log' - hosts = log_loc + '/' + network + '_' + str(num_nodes * num_gpus) + '_workers' 
- generate_hosts_file(num_nodes, hosts, args_workers_file) - if mode == 'native': - stop_old_processes(hosts, 'train_imagenet.py') - else: - stop_old_processes(hosts, '../gluon/image-classification.py') - launch_args = ['../../tools/launch.py', '-n', str(num_nodes), '-s', str(num_nodes * 2), '-H', hosts, - ' '.join(benchmark_args)] - - # use train_imagenet/image_classification when running on a single node - if kv_store == 'device': - imagenet = RunCmd(benchmark_args, log) - imagenet.startCmd(timeout=60 * 10) - else: - launch = RunCmd(launch_args, log) - launch.startCmd(timeout=60 * 10) - - if mode == 'native': - stop_old_processes(hosts, 'train_imagenet.py') - else: - stop_old_processes(hosts, '../gluon/image-classification.py') - img_per_sec = images_processed(log, mode) - LOGGER.info('network: %s, num_gpus: %d, image/sec: %f', network, num_gpus * num_nodes, img_per_sec) - return img_per_sec - -def plot_graph(args): - speedup_chart = pygal.Line(x_title='gpus', y_title='speedup', logarithmic=True) - speedup_chart.x_labels = map(str, series(args.worker_count * args.gpu_count)) - speedup_chart.add('ideal speedup', series(args.worker_count * args.gpu_count)) - for net in args.networks: - image_single_gpu = net.gpu_speedup[1] if 1 in net.gpu_speedup or not net.gpu_speedup[1] else 1 - y_values = [each / image_single_gpu for each in net.gpu_speedup.values()] - LOGGER.info('%s: image_single_gpu:%.2f' % (net.name, image_single_gpu)) - LOGGER.debug('network:%s, y_values: %s' % (net.name, ' '.join(map(str, y_values)))) - speedup_chart.add(net.name, y_values \ - , formatter=lambda y_val, img=copy.deepcopy(image_single_gpu), batch_size=copy.deepcopy( - net.batch_size): 'speedup:%.2f, img/sec:%.2f, batch/gpu:%d' % \ - (0 if y_val is None else y_val, 0 if y_val is None else y_val * img, batch_size)) - speedup_chart.render_to_file(log_loc + '/speedup.svg') - -def write_csv(log_loc, args): - for net in args.networks: - with open(log_loc + '/' + net.name + '.csv', 'wb') as f: - w = csv.writer(f) - w.writerow(['num_gpus', 'img_processed_per_sec']) - w.writerows(net.gpu_speedup.items()) - -def main(): - args = parse_args() - for net in args.networks: - # use kv_store='device' when running on 1 node - for num_gpus in series(args.gpu_count): - imgs_per_sec = run_benchmark(kv_store='device', data_shape=net.img_size, batch_size=net.batch_size, \ - num_gpus=num_gpus, num_nodes=1, network=net.name, - args_workers_file=args.worker_file, mode=net.mode) - net.gpu_speedup[num_gpus] = imgs_per_sec - for num_nodes in series(args.worker_count)[1::]: - imgs_per_sec = run_benchmark(kv_store='dist_sync_device', data_shape=net.img_size, - batch_size=net.batch_size, \ - num_gpus=args.gpu_count, num_nodes=num_nodes, network=net.name, - args_workers_file=args.worker_file, mode=net.mode) - net.gpu_speedup[num_nodes * args.gpu_count] = imgs_per_sec - LOGGER.info('Network: %s (num_gpus, images_processed): %s', net.name, ','.join(map(str, net.gpu_speedup.items()))) - write_csv(log_loc, args) - plot_graph(args) - -if __name__ == '__main__': - main() diff --git a/example/image-classification/benchmark_score.py b/example/image-classification/benchmark_score.py deleted file mode 100644 index e81a30bd6439..000000000000 --- a/example/image-classification/benchmark_score.py +++ /dev/null @@ -1,134 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Benchmark the scoring performance on various CNNs -""" -from common import find_mxnet -from common.util import get_gpus -import mxnet as mx -import mxnet.gluon.model_zoo.vision as models -from importlib import import_module -import logging -import argparse -import time -import numpy as np -logging.basicConfig(level=logging.DEBUG) - -parser = argparse.ArgumentParser(description='SymbolAPI-based CNN inference performance benchmark') -parser.add_argument('--network', type=str, default='all', - choices=['all', 'alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50', - 'resnet-152', 'inception-bn', 'inception-v3', - 'inception-v4', 'inception-resnet-v2', 'mobilenet', - 'densenet121', 'squeezenet1.1']) -parser.add_argument('--batch-size', type=int, default=0, - help='Batch size to use for benchmarking. Example: 32, 64, 128.' - 'By default, runs benchmark for batch sizes - 1, 32, 64, 128, 256') - -opt = parser.parse_args() - -def get_symbol(network, batch_size, dtype): - image_shape = (3,299,299) if network in ['inception-v3', 'inception-v4'] else (3,224,224) - num_layers = 0 - if network == 'inception-resnet-v2': - network = network - elif 'resnet' in network: - num_layers = int(network.split('-')[1]) - network = network.split('-')[0] - if 'vgg' in network: - num_layers = int(network.split('-')[1]) - network = 'vgg' - if network in ['densenet121', 'squeezenet1.1']: - sym = models.get_model(network) - sym.hybridize() - data = mx.sym.var('data') - sym = sym(data) - sym = mx.sym.SoftmaxOutput(sym, name='softmax') - else: - net = import_module('symbols.'+network) - sym = net.get_symbol(num_classes=1000, - image_shape=','.join([str(i) for i in image_shape]), - num_layers=num_layers, - dtype=dtype) - return (sym, [('data', (batch_size,)+image_shape)]) - -def score(network, dev, batch_size, num_batches, dtype): - # get mod - sym, data_shape = get_symbol(network, batch_size, dtype) - mod = mx.mod.Module(symbol=sym, context=dev) - mod.bind(for_training = False, - inputs_need_grad = False, - data_shapes = data_shape) - mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - - # get data - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=dev) for _, shape in mod.data_shapes] - batch = mx.io.DataBatch(data, []) # empty label - - # run - dry_run = 5 # use 5 iterations to warm up - for i in range(dry_run+num_batches): - if i == dry_run: - tic = time.time() - mod.forward(batch, is_train=False) - for output in mod.get_outputs(): - output.wait_to_read() - - # return num images per second - return num_batches*batch_size/(time.time() - tic) - -if __name__ == '__main__': - if opt.network == 'all': - networks = ['alexnet', 'vgg-16', 'resnetv1-50', 'resnet-50', - 'resnet-152', 'inception-bn', 'inception-v3', - 'inception-v4', 'inception-resnet-v2', - 'mobilenet', 'densenet121', 'squeezenet1.1'] - logging.info('It may take some time to run all models, ' - 'set --network to run a specific one') - else: - networks = [opt.network] - devs = 
[mx.gpu(0)] if len(get_gpus()) > 0 else [] - # Enable USE_MKLDNN for better CPU performance - devs.append(mx.cpu()) - - if opt.batch_size == 0: - batch_sizes = [1, 32, 64, 128, 256] - logging.info('run batchsize [1, 32, 64, 128, 256] by default, ' - 'set --batch-size to run a specific one') - else: - batch_sizes = [opt.batch_size] - - for net in networks: - logging.info('network: %s', net) - if net in ['densenet121', 'squeezenet1.1']: - logging.info('network: %s is converted from gluon modelzoo', net) - logging.info('you can run benchmark/python/gluon/benchmark_gluon.py for more models') - for d in devs: - logging.info('device: %s', d) - logged_fp16_warning = False - for b in batch_sizes: - for dtype in ['float32', 'float16']: - if d == mx.cpu() and dtype == 'float16': - #float16 is not supported on CPU - continue - elif net in ['inception-bn', 'alexnet'] and dtype == 'float16': - if not logged_fp16_warning: - logging.info('Model definition for {} does not support float16'.format(net)) - logged_fp16_warning = True - else: - speed = score(network=net, dev=d, batch_size=b, num_batches=10, dtype=dtype) - logging.info('batch size %2d, dtype %s, images/sec: %f', b, dtype, speed) diff --git a/example/image-classification/common/__init__.py b/example/image-classification/common/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/image-classification/common/data.py b/example/image-classification/common/data.py deleted file mode 100644 index 78385a7b7e40..000000000000 --- a/example/image-classification/common/data.py +++ /dev/null @@ -1,206 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -import random -from mxnet.io import DataBatch, DataIter -import numpy as np - -def add_data_args(parser): - data = parser.add_argument_group('Data', 'the input images') - data.add_argument('--data-train', type=str, help='the training data') - data.add_argument('--data-train-idx', type=str, default='', help='the index of training data') - data.add_argument('--data-val', type=str, help='the validation data') - data.add_argument('--data-val-idx', type=str, default='', help='the index of validation data') - data.add_argument('--rgb-mean', type=str, default='123.68,116.779,103.939', - help='a tuple of size 3 for the mean rgb') - data.add_argument('--rgb-std', type=str, default='1,1,1', - help='a tuple of size 3 for the std rgb') - data.add_argument('--pad-size', type=int, default=0, - help='padding the input image') - data.add_argument('--fill-value', type=int, default=127, - help='Set the padding pixels value to fill_value') - data.add_argument('--image-shape', type=str, - help='the image shape feed into the network, e.g. 
(3,224,224)') - data.add_argument('--num-classes', type=int, help='the number of classes') - data.add_argument('--num-examples', type=int, help='the number of training examples') - data.add_argument('--data-nthreads', type=int, default=4, - help='number of threads for data decoding') - data.add_argument('--benchmark', type=int, default=0, - help='if 1, then feed the network with synthetic data') - return data - -def add_data_aug_args(parser): - aug = parser.add_argument_group( - 'Image augmentations', 'implemented in src/io/image_aug_default.cc') - aug.add_argument('--random-crop', type=int, default=0, - help='if or not randomly crop the image') - aug.add_argument('--random-mirror', type=int, default=0, - help='if or not randomly flip horizontally') - aug.add_argument('--max-random-h', type=int, default=0, - help='max change of hue, whose range is [0, 180]') - aug.add_argument('--max-random-s', type=int, default=0, - help='max change of saturation, whose range is [0, 255]') - aug.add_argument('--max-random-l', type=int, default=0, - help='max change of intensity, whose range is [0, 255]') - aug.add_argument('--min-random-aspect-ratio', type=float, default=None, - help='min value of aspect ratio, whose value is either None or a positive value.') - aug.add_argument('--max-random-aspect-ratio', type=float, default=0, - help='max value of aspect ratio. If min_random_aspect_ratio is None, ' - 'the aspect ratio range is [1-max_random_aspect_ratio, ' - '1+max_random_aspect_ratio], otherwise it is ' - '[min_random_aspect_ratio, max_random_aspect_ratio].') - aug.add_argument('--max-random-rotate-angle', type=int, default=0, - help='max angle to rotate, whose range is [0, 360]') - aug.add_argument('--max-random-shear-ratio', type=float, default=0, - help='max ratio to shear, whose range is [0, 1]') - aug.add_argument('--max-random-scale', type=float, default=1, - help='max ratio to scale') - aug.add_argument('--min-random-scale', type=float, default=1, - help='min ratio to scale, should >= img_size/input_shape. 
' - 'otherwise use --pad-size') - aug.add_argument('--max-random-area', type=float, default=1, - help='max area to crop in random resized crop, whose range is [0, 1]') - aug.add_argument('--min-random-area', type=float, default=1, - help='min area to crop in random resized crop, whose range is [0, 1]') - aug.add_argument('--min-crop-size', type=int, default=-1, - help='Crop both width and height into a random size in ' - '[min_crop_size, max_crop_size]') - aug.add_argument('--max-crop-size', type=int, default=-1, - help='Crop both width and height into a random size in ' - '[min_crop_size, max_crop_size]') - aug.add_argument('--brightness', type=float, default=0, - help='brightness jittering, whose range is [0, 1]') - aug.add_argument('--contrast', type=float, default=0, - help='contrast jittering, whose range is [0, 1]') - aug.add_argument('--saturation', type=float, default=0, - help='saturation jittering, whose range is [0, 1]') - aug.add_argument('--pca-noise', type=float, default=0, - help='pca noise, whose range is [0, 1]') - aug.add_argument('--random-resized-crop', type=int, default=0, - help='whether to use random resized crop') - return aug - -class SyntheticDataIter(DataIter): - def __init__(self, num_classes, data_shape, max_iter, dtype): - self.batch_size = data_shape[0] - self.cur_iter = 0 - self.max_iter = max_iter - self.dtype = dtype - label = np.random.randint(0, num_classes, [self.batch_size,]) - data = np.random.uniform(-1, 1, data_shape) - self.data = mx.nd.array(data, dtype=self.dtype, ctx=mx.Context('cpu_pinned', 0)) - self.label = mx.nd.array(label, dtype=self.dtype, ctx=mx.Context('cpu_pinned', 0)) - def __iter__(self): - return self - @property - def provide_data(self): - return [mx.io.DataDesc('data', self.data.shape, self.dtype)] - @property - def provide_label(self): - return [mx.io.DataDesc('softmax_label', (self.batch_size,), self.dtype)] - def next(self): - self.cur_iter += 1 - if self.cur_iter <= self.max_iter: - return DataBatch(data=(self.data,), - label=(self.label,), - pad=0, - index=None, - provide_data=self.provide_data, - provide_label=self.provide_label) - else: - raise StopIteration - def __next__(self): - return self.next() - def reset(self): - self.cur_iter = 0 - -def get_rec_iter(args, kv=None): - image_shape = tuple([int(l) for l in args.image_shape.split(',')]) - if 'benchmark' in args and args.benchmark: - data_shape = (args.batch_size,) + image_shape - train = SyntheticDataIter(args.num_classes, data_shape, - args.num_examples / args.batch_size, np.float32) - return (train, None) - if kv: - (rank, nworker) = (kv.rank, kv.num_workers) - else: - (rank, nworker) = (0, 1) - rgb_mean = [float(i) for i in args.rgb_mean.split(',')] - rgb_std = [float(i) for i in args.rgb_std.split(',')] - train = mx.io.ImageRecordIter( - path_imgrec = args.data_train, - path_imgidx = args.data_train_idx, - label_width = 1, - mean_r = rgb_mean[0], - mean_g = rgb_mean[1], - mean_b = rgb_mean[2], - std_r = rgb_std[0], - std_g = rgb_std[1], - std_b = rgb_std[2], - data_name = 'data', - label_name = 'softmax_label', - data_shape = image_shape, - batch_size = args.batch_size, - rand_crop = args.random_crop, - max_random_scale = args.max_random_scale, - pad = args.pad_size, - fill_value = args.fill_value, - random_resized_crop = args.random_resized_crop, - min_random_scale = args.min_random_scale, - max_aspect_ratio = args.max_random_aspect_ratio, - min_aspect_ratio = args.min_random_aspect_ratio, - max_random_area = args.max_random_area, - min_random_area = 
args.min_random_area, - min_crop_size = args.min_crop_size, - max_crop_size = args.max_crop_size, - brightness = args.brightness, - contrast = args.contrast, - saturation = args.saturation, - pca_noise = args.pca_noise, - random_h = args.max_random_h, - random_s = args.max_random_s, - random_l = args.max_random_l, - max_rotate_angle = args.max_random_rotate_angle, - max_shear_ratio = args.max_random_shear_ratio, - rand_mirror = args.random_mirror, - preprocess_threads = args.data_nthreads, - shuffle = True, - num_parts = nworker, - part_index = rank) - if args.data_val is None: - return (train, None) - val = mx.io.ImageRecordIter( - path_imgrec = args.data_val, - path_imgidx = args.data_val_idx, - label_width = 1, - mean_r = rgb_mean[0], - mean_g = rgb_mean[1], - mean_b = rgb_mean[2], - std_r = rgb_std[0], - std_g = rgb_std[1], - std_b = rgb_std[2], - data_name = 'data', - label_name = 'softmax_label', - batch_size = args.batch_size, - data_shape = image_shape, - preprocess_threads = args.data_nthreads, - rand_crop = False, - rand_mirror = False, - num_parts = nworker, - part_index = rank) - return (train, val) diff --git a/example/image-classification/common/find_mxnet.py b/example/image-classification/common/find_mxnet.py deleted file mode 100644 index 2ce07130a361..000000000000 --- a/example/image-classification/common/find_mxnet.py +++ /dev/null @@ -1,24 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os, sys -try: - import mxnet as mx -except ImportError: - curr_path = os.path.abspath(os.path.dirname(__file__)) - sys.path.append(os.path.join(curr_path, "../../../python")) - import mxnet as mx diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py deleted file mode 100644 index 8662db3baba4..000000000000 --- a/example/image-classification/common/fit.py +++ /dev/null @@ -1,340 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -""" example train fit utility """ -import logging -import os -import time -import re -import math -import mxnet as mx - -def get_epoch_size(args, kv): - return math.ceil(int(args.num_examples / kv.num_workers) / args.batch_size) - -def _get_lr_scheduler(args, kv): - if 'lr_factor' not in args or args.lr_factor >= 1: - return (args.lr, None) - epoch_size = get_epoch_size(args, kv) - begin_epoch = args.load_epoch if args.load_epoch else 0 - if 'pow' in args.lr_step_epochs: - lr = args.lr - max_up = args.num_epochs * epoch_size - pwr = float(re.sub('pow[- ]*', '', args.lr_step_epochs)) - poly_sched = mx.lr_scheduler.PolyScheduler(max_up, lr, pwr) - return (lr, poly_sched) - step_epochs = [int(l) for l in args.lr_step_epochs.split(',')] - lr = args.lr - for s in step_epochs: - if begin_epoch >= s: - lr *= args.lr_factor - if lr != args.lr: - logging.info('Adjust learning rate to %e for epoch %d', - lr, begin_epoch) - - steps = [epoch_size * (x - begin_epoch) - for x in step_epochs if x - begin_epoch > 0] - if steps: - return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor, - base_lr=args.lr)) - else: - return (lr, None) - -def _load_model(args, rank=0): - if 'load_epoch' not in args or args.load_epoch is None: - return (None, None, None) - assert args.model_prefix is not None - model_prefix = args.model_prefix - if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)): - model_prefix += "-%d" % (rank) - sym, arg_params, aux_params = mx.model.load_checkpoint( - model_prefix, args.load_epoch) - logging.info('Loaded model %s_%04d.params', model_prefix, args.load_epoch) - return (sym, arg_params, aux_params) - - -def _save_model(args, rank=0): - if args.model_prefix is None: - return None - return mx.callback.do_checkpoint(args.model_prefix if rank == 0 else "%s-%d" % ( - args.model_prefix, rank), period=args.save_period) - - -def add_fit_args(parser): - """ - parser : argparse.ArgumentParser - return a parser added with args required by fit - """ - train = parser.add_argument_group('Training', 'model training') - train.add_argument('--network', type=str, - help='the neural network to use') - train.add_argument('--num-layers', type=int, - help='number of layers in the neural network, \ - required by some networks such as resnet') - train.add_argument('--gpus', type=str, - help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu') - train.add_argument('--kv-store', type=str, default='device', - help='key-value store type') - train.add_argument('--num-epochs', type=int, default=100, - help='max num of epochs') - train.add_argument('--lr', type=float, default=0.1, - help='initial learning rate') - train.add_argument('--lr-factor', type=float, default=0.1, - help='the ratio to reduce lr on each step') - train.add_argument('--lr-step-epochs', type=str, - help='the epochs to reduce the lr, e.g. 
30,60') - train.add_argument('--initializer', type=str, default='default', - help='the initializer type') - train.add_argument('--optimizer', type=str, default='sgd', - help='the optimizer type') - train.add_argument('--mom', type=float, default=0.9, - help='momentum for sgd') - train.add_argument('--wd', type=float, default=0.0001, - help='weight decay for sgd') - train.add_argument('--batch-size', type=int, default=128, - help='the batch size') - train.add_argument('--disp-batches', type=int, default=20, - help='show progress for every n batches') - train.add_argument('--model-prefix', type=str, - help='model prefix') - train.add_argument('--save-period', type=int, default=1, help='params saving period') - parser.add_argument('--monitor', dest='monitor', type=int, default=0, - help='log network parameters every N iters if larger than 0') - train.add_argument('--load-epoch', type=int, - help='load the model on an epoch using the model-load-prefix') - train.add_argument('--top-k', type=int, default=0, - help='report the top-k accuracy. 0 means no report.') - train.add_argument('--loss', type=str, default='', - help='show the cross-entropy or nll loss. ce strands for cross-entropy, nll-loss stands for likelihood loss') - train.add_argument('--test-io', type=int, default=0, - help='1 means test reading speed without training') - train.add_argument('--dtype', type=str, default='float32', - help='precision: float32 or float16') - train.add_argument('--gc-type', type=str, default='none', - help='type of gradient compression to use, \ - takes `2bit` or `none` for now') - train.add_argument('--gc-threshold', type=float, default=0.5, - help='threshold for 2bit gradient compression') - # additional parameters for large batch sgd - train.add_argument('--macrobatch-size', type=int, default=0, - help='distributed effective batch size') - train.add_argument('--warmup-epochs', type=int, default=5, - help='the epochs to ramp-up lr to scaled large-batch value') - train.add_argument('--warmup-strategy', type=str, default='linear', - help='the ramping-up strategy for large batch sgd') - train.add_argument('--profile-worker-suffix', type=str, default='', - help='profile workers actions into this file. 
During distributed training\ - filename saved will be rank1_ followed by this suffix') - train.add_argument('--profile-server-suffix', type=str, default='', - help='profile server actions into a file with name like rank1_ followed by this suffix \ - during distributed training') - train.add_argument('--use-imagenet-data-augmentation', type=int, default=0, - help='enable data augmentation of ImageNet data, default disabled') - return train - - -def fit(args, network, data_loader, **kwargs): - """ - train a model - args : argparse returns - network : the symbol definition of the nerual network - data_loader : function that returns the train and val data iterators - """ - # kvstore - kv = mx.kvstore.create(args.kv_store) - if args.gc_type != 'none': - kv.set_gradient_compression({'type': args.gc_type, - 'threshold': args.gc_threshold}) - if args.profile_server_suffix: - mx.profiler.set_config(filename=args.profile_server_suffix, profile_all=True, profile_process='server') - mx.profiler.set_state(state='run', profile_process='server') - - if args.profile_worker_suffix: - if kv.num_workers > 1: - filename = 'rank' + str(kv.rank) + '_' + args.profile_worker_suffix - else: - filename = args.profile_worker_suffix - mx.profiler.set_config(filename=filename, profile_all=True, profile_process='worker') - mx.profiler.set_state(state='run', profile_process='worker') - - # logging - head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s' - logging.basicConfig(level=logging.DEBUG, format=head) - logging.info('start with arguments %s', args) - - epoch_size = get_epoch_size(args, kv) - - # data iterators - (train, val) = data_loader(args, kv) - if 'dist' in args.kv_store and not 'async' in args.kv_store: - logging.info('Resizing training data to %d batches per machine', epoch_size) - # resize train iter to ensure each machine has same number of batches per epoch - # if not, dist_sync can hang at the end with one machine waiting for other machines - train = mx.io.ResizeIter(train, epoch_size) - - if args.test_io: - tic = time.time() - for i, batch in enumerate(train): - if isinstance(batch, list): - for b in batch: - for j in b.data: - j.wait_to_read() - else: - for j in batch.data: - j.wait_to_read() - if (i + 1) % args.disp_batches == 0: - logging.info('Batch [%d]\tSpeed: %.2f samples/sec', i, - args.disp_batches * args.batch_size / (time.time() - tic)) - tic = time.time() - return - - # load model - if 'arg_params' in kwargs and 'aux_params' in kwargs: - arg_params = kwargs['arg_params'] - aux_params = kwargs['aux_params'] - else: - sym, arg_params, aux_params = _load_model(args, kv.rank) - if sym is not None: - assert sym.tojson() == network.tojson() - - # save model - checkpoint = _save_model(args, kv.rank) - - # devices for training - devs = mx.cpu() if args.gpus is None or args.gpus == "" else [ - mx.gpu(int(i)) for i in args.gpus.split(',')] - - # learning rate - lr, lr_scheduler = _get_lr_scheduler(args, kv) - - # create model - model = mx.mod.Module( - context=devs, - symbol=network - ) - - lr_scheduler = lr_scheduler - optimizer_params = { - 'learning_rate': lr, - 'wd': args.wd, - 'lr_scheduler': lr_scheduler, - 'multi_precision': True} - - # Only a limited number of optimizers have 'momentum' property - has_momentum = {'sgd', 'dcasgd', 'nag', 'signum'} - if args.optimizer in has_momentum: - optimizer_params['momentum'] = args.mom - - monitor = mx.mon.Monitor( - args.monitor, pattern=".*") if args.monitor > 0 else None - - # A limited number of optimizers have a warmup period - has_warmup 
= {'lbnag'} - if args.optimizer in has_warmup: - nworkers = kv.num_workers - if epoch_size < 1: - epoch_size = 1 - macrobatch_size = args.macrobatch_size - if macrobatch_size < args.batch_size * nworkers: - macrobatch_size = args.batch_size * nworkers - #batch_scale = round(float(macrobatch_size) / args.batch_size / nworkers +0.4999) - batch_scale = math.ceil( - float(macrobatch_size) / args.batch_size / nworkers) - optimizer_params['updates_per_epoch'] = epoch_size - optimizer_params['begin_epoch'] = args.load_epoch if args.load_epoch else 0 - optimizer_params['batch_scale'] = batch_scale - optimizer_params['warmup_strategy'] = args.warmup_strategy - optimizer_params['warmup_epochs'] = args.warmup_epochs - optimizer_params['num_epochs'] = args.num_epochs - - if args.initializer == 'default': - if args.network == 'alexnet': - # AlexNet will not converge using Xavier - initializer = mx.init.Normal() - # VGG will not trend to converge using Xavier-Gaussian - elif args.network and 'vgg' in args.network: - initializer = mx.init.Xavier() - else: - initializer = mx.init.Xavier( - rnd_type='gaussian', factor_type="in", magnitude=2) - # initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), - elif args.initializer == 'xavier': - initializer = mx.init.Xavier() - elif args.initializer == 'msra': - initializer = mx.init.MSRAPrelu() - elif args.initializer == 'orthogonal': - initializer = mx.init.Orthogonal() - elif args.initializer == 'normal': - initializer = mx.init.Normal() - elif args.initializer == 'uniform': - initializer = mx.init.Uniform() - elif args.initializer == 'one': - initializer = mx.init.One() - elif args.initializer == 'zero': - initializer = mx.init.Zero() - - # evaluation metrices - eval_metrics = ['accuracy'] - if args.top_k > 0: - eval_metrics.append(mx.gluon.metric.create( - 'top_k_accuracy', top_k=args.top_k)) - - supported_loss = ['ce', 'nll_loss'] - if len(args.loss) > 0: - # ce or nll loss is only applicable to softmax output - loss_type_list = args.loss.split(',') - if 'softmax_output' in network.list_outputs(): - for loss_type in loss_type_list: - loss_type = loss_type.strip() - if loss_type == 'nll': - loss_type = 'nll_loss' - if loss_type not in supported_loss: - logging.warning(loss_type + ' is not an valid loss type, only cross-entropy or ' \ - 'negative likelihood loss is supported!') - else: - eval_metrics.append(mx.gluon.metric.create(loss_type)) - else: - logging.warning("The output is not softmax_output, loss argument will be skipped!") - - # callbacks that run after each batch - batch_end_callbacks = [mx.callback.Speedometer( - args.batch_size, args.disp_batches)] - if 'batch_end_callback' in kwargs: - cbs = kwargs['batch_end_callback'] - batch_end_callbacks += cbs if isinstance(cbs, list) else [cbs] - - # run - model.fit(train, - begin_epoch=args.load_epoch if args.load_epoch else 0, - num_epoch=args.num_epochs, - eval_data=val, - eval_metric=eval_metrics, - kvstore=kv, - optimizer=args.optimizer, - optimizer_params=optimizer_params, - initializer=initializer, - arg_params=arg_params, - aux_params=aux_params, - batch_end_callback=batch_end_callbacks, - epoch_end_callback=checkpoint, - allow_missing=True, - monitor=monitor) - - if args.profile_server_suffix: - mx.profiler.set_state(state='run', profile_process='server') - if args.profile_worker_suffix: - mx.profiler.set_state(state='run', profile_process='worker') diff --git a/example/image-classification/common/modelzoo.py b/example/image-classification/common/modelzoo.py deleted file mode 100644 index 
ce8fd5e0ed16..000000000000 --- a/example/image-classification/common/modelzoo.py +++ /dev/null @@ -1,63 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -from common.util import download_file - -_base_model_url = 'http://data.mxnet.io/models/' -_default_model_info = { - 'imagenet1k-inception-bn': {'symbol':_base_model_url+'imagenet/inception-bn/Inception-BN-symbol.json', - 'params':_base_model_url+'imagenet/inception-bn/Inception-BN-0126.params'}, - 'imagenet1k-resnet-18': {'symbol':_base_model_url+'imagenet/resnet/18-layers/resnet-18-symbol.json', - 'params':_base_model_url+'imagenet/resnet/18-layers/resnet-18-0000.params'}, - 'imagenet1k-resnet-34': {'symbol':_base_model_url+'imagenet/resnet/34-layers/resnet-34-symbol.json', - 'params':_base_model_url+'imagenet/resnet/34-layers/resnet-34-0000.params'}, - 'imagenet1k-resnet-50': {'symbol':_base_model_url+'imagenet/resnet/50-layers/resnet-50-symbol.json', - 'params':_base_model_url+'imagenet/resnet/50-layers/resnet-50-0000.params'}, - 'imagenet1k-resnet-101': {'symbol':_base_model_url+'imagenet/resnet/101-layers/resnet-101-symbol.json', - 'params':_base_model_url+'imagenet/resnet/101-layers/resnet-101-0000.params'}, - 'imagenet1k-resnet-152': {'symbol':_base_model_url+'imagenet/resnet/152-layers/resnet-152-symbol.json', - 'params':_base_model_url+'imagenet/resnet/152-layers/resnet-152-0000.params'}, - 'imagenet1k-resnext-50': {'symbol':_base_model_url+'imagenet/resnext/50-layers/resnext-50-symbol.json', - 'params':_base_model_url+'imagenet/resnext/50-layers/resnext-50-0000.params'}, - 'imagenet1k-resnext-101': {'symbol':_base_model_url+'imagenet/resnext/101-layers/resnext-101-symbol.json', - 'params':_base_model_url+'imagenet/resnext/101-layers/resnext-101-0000.params'}, - 'imagenet1k-resnext-101-64x4d': {'symbol':_base_model_url+'imagenet/resnext/101-layers/resnext-101-64x4d-symbol.json', - 'params':_base_model_url+'imagenet/resnext/101-layers/resnext-101-64x4d-0000.params'}, - 'imagenet11k-resnet-152': {'symbol':_base_model_url+'imagenet-11k/resnet-152/resnet-152-symbol.json', - 'params':_base_model_url+'imagenet-11k/resnet-152/resnet-152-0000.params'}, - 'imagenet11k-place365ch-resnet-152': {'symbol':_base_model_url+'imagenet-11k-place365-ch/resnet-152-symbol.json', - 'params':_base_model_url+'imagenet-11k-place365-ch/resnet-152-0000.params'}, - 'imagenet11k-place365ch-resnet-50': {'symbol':_base_model_url+'imagenet-11k-place365-ch/resnet-50-symbol.json', - 'params':_base_model_url+'imagenet-11k-place365-ch/resnet-50-0000.params'}, -} - -def download_model(model_name, dst_dir='./', meta_info=None): - if meta_info is None: - meta_info = _default_model_info - meta_info = dict(meta_info) - if model_name not in meta_info: - return (None, 0) - if not os.path.isdir(dst_dir): - 
os.mkdir(dst_dir) - meta = dict(meta_info[model_name]) - assert 'symbol' in meta, "missing symbol url" - model_name = os.path.join(dst_dir, model_name) - download_file(meta['symbol'], model_name+'-symbol.json') - assert 'params' in meta, "mssing parameter file url" - download_file(meta['params'], model_name+'-0000.params') - return (model_name, 0) diff --git a/example/image-classification/common/util.py b/example/image-classification/common/util.py deleted file mode 100644 index 8737b69a7351..000000000000 --- a/example/image-classification/common/util.py +++ /dev/null @@ -1,54 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import subprocess -import os -import errno - -import mxnet as mx - -def download_file(url, local_fname=None, force_write=False): - # requests is not default installed - import requests - if local_fname is None: - local_fname = url.split('/')[-1] - if not force_write and os.path.exists(local_fname): - return local_fname - - dir_name = os.path.dirname(local_fname) - - if dir_name != "": - if not os.path.exists(dir_name): - try: # try to create the directory if it doesn't exists - os.makedirs(dir_name) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise - - r = requests.get(url, stream=True) - assert r.status_code == 200, "failed to open %s" % url - with open(local_fname, 'wb') as f: - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - f.write(chunk) - return local_fname - -def get_gpus(): - """ - return a list of GPUs - """ - return range(mx.util.get_gpu_count()) diff --git a/example/image-classification/data/caltech256.sh b/example/image-classification/data/caltech256.sh deleted file mode 100755 index 187d026f801b..000000000000 --- a/example/image-classification/data/caltech256.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -# This file download the caltech 256 dataset -# (http://www.vision.caltech.edu/Image_Datasets/Caltech256/), and split it into -# the train and val rec files. 
- -# number of images per class for training -IMG_TRAIN=60 - -# download -if [ ! -e 256_ObjectCategories.tar ]; then - wget http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar -fi - -# split into train and val set -tar -xf 256_ObjectCategories.tar -TRAIN_DIR=caltech_256_train -mkdir -p ${TRAIN_DIR} -for i in 256_ObjectCategories/*; do - c=`basename $i` - echo "spliting $c" - mkdir -p ${TRAIN_DIR}/$c - for j in `ls $i/*.jpg | shuf | head -n ${IMG_TRAIN}`; do - mv $j ${TRAIN_DIR}/$c/ - done -done - -# generate lst files -CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -MX_DIR=${CUR_DIR}/../../../ -python ${MX_DIR}/tools/im2rec.py --list --recursive caltech256-train ${TRAIN_DIR}/ -python ${MX_DIR}/tools/im2rec.py --list --recursive caltech256-val 256_ObjectCategories/ -mv caltech256-train_train.lst caltech256-train.lst -rm caltech256-train_* -mv caltech256-val_train.lst caltech256-val.lst -rm caltech256-val_* - -# generate rec files -python ${MX_DIR}/tools/im2rec.py --resize 256 --quality 95 --num-thread 16 caltech256-val 256_ObjectCategories/ -python ${MX_DIR}/tools/im2rec.py --resize 256 --quality 95 --num-thread 16 caltech256-train ${TRAIN_DIR}/ - -# clean -rm -rf ${TRAIN_DIR} 256_ObjectCategories/ diff --git a/example/image-classification/data/imagenet1k-val.sh b/example/image-classification/data/imagenet1k-val.sh deleted file mode 100755 index 13cb551140f8..000000000000 --- a/example/image-classification/data/imagenet1k-val.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -# This file download the imagnet-1k validation dataset and convert it into a rec -# file. One need to provide the URL for the ILSVRC2012_img_val.tar, which can be -# find at http://www.image-net.org/download-images -# -# Example usage (replace the URL with the correct one): -# ./imagenet1k-val.sh http://xxxxxx/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar - -if [ ! -e ILSVRC2012_img_val.tar ]; then - wget $1 -fi -mkdir -p val -tar -xf ILSVRC2012_img_val.tar -C val -wget http://data.mxnet.io/models/imagenet/resnet/val.lst -O imagenet1k-val.lst - -CUR_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -MX_DIR=${CUR_DIR}/../../../ - -python ${CUR_DIR}/../../../tools/im2rec.py --resize 256 --quality 90 --num-thread 16 imagenet1k-val val/ - -rm -rf val diff --git a/example/image-classification/fine-tune.py b/example/image-classification/fine-tune.py deleted file mode 100644 index 719fa86e01b3..000000000000 --- a/example/image-classification/fine-tune.py +++ /dev/null @@ -1,102 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -import argparse -import logging -logging.basicConfig(level=logging.DEBUG) -from common import find_mxnet -from common import data, fit, modelzoo -import mxnet as mx -import numpy as np - - -def get_fine_tune_model(symbol, arg_params, num_classes, layer_name, dtype='float32'): - """ - symbol: the pre-trained network symbol - arg_params: the argument parameters of the pre-trained model - num_classes: the number of classes for the fine-tune datasets - layer_name: the layer name before the last fully-connected layer - """ - all_layers = symbol.get_internals() - net = all_layers[layer_name+'_output'] - net = mx.symbol.FullyConnected(data=net, num_hidden=num_classes, name='fc') - if dtype == 'float16': - net = mx.sym.Cast(data=net, dtype=np.float32) - net = mx.symbol.SoftmaxOutput(data=net, name='softmax') - new_args = dict({k:arg_params[k] for k in arg_params if 'fc' not in k}) - return (net, new_args) - -if __name__ == "__main__": - # parse args - parser = argparse.ArgumentParser(description="fine-tune a dataset", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - train = fit.add_fit_args(parser) - data.add_data_args(parser) - aug = data.add_data_aug_args(parser) - parser.add_argument('--pretrained-model', type=str, - help='the pre-trained model. can be prefix of local model files prefix \ - or a model name from common/modelzoo') - parser.add_argument('--layer-before-fullc', type=str, default='flatten0', - help='the name of the layer before the last fullc layer')\ - - # use less augmentations for fine-tune. by default here it uses no augmentations - - # use a small learning rate and less regularizations - parser.set_defaults(image_shape='3,224,224', - num_epochs=30, - lr=.01, - lr_step_epochs='20', - wd=0, - mom=0) - args = parser.parse_args() - - - # load pretrained model and params - dir_path = os.path.dirname(os.path.realpath(__file__)) - (prefix, epoch) = modelzoo.download_model( - args.pretrained_model, os.path.join(dir_path, 'model')) - if prefix is None: - (prefix, epoch) = (args.pretrained_model, args.load_epoch) - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - - if args.dtype != 'float32': - # load symbol of trained network, so we can cast it to support other dtype - # fine tuning a network in a datatype which was not used for training originally, - # requires access to the code used to generate the symbol used to train that model. - # we then need to modify the symbol to add a layer at the beginning - # to cast data to that dtype. We also need to cast output of layers before softmax - # to float32 so that softmax can still be in float32. 
- # if the network chosen from symols/ folder doesn't have cast for the new datatype, - # it will still train in fp32 - if args.network not in ['inception-v3',\ - 'inception-v4', 'resnet-v1', 'resnet', 'resnext', 'vgg']: - raise ValueError('Given network does not have support for dtypes other than float32.\ - Please add a cast layer at the beginning to train in that mode.') - from importlib import import_module - net = import_module('symbols.'+args.network) - sym = net.get_symbol(**vars(args)) - - # remove the last fullc layer and add a new softmax layer - (new_sym, new_args) = get_fine_tune_model(sym, arg_params, args.num_classes, - args.layer_before_fullc, args.dtype) - # train - fit.fit(args = args, - network = new_sym, - data_loader = data.get_rec_iter, - arg_params = new_args, - aux_params = aux_params) diff --git a/example/image-classification/score.py b/example/image-classification/score.py deleted file mode 100644 index dbad44ef6981..000000000000 --- a/example/image-classification/score.py +++ /dev/null @@ -1,107 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import argparse -from common import modelzoo, find_mxnet -import mxnet as mx -import time -import os -import logging - -def score(model, data_val, metrics, gpus, batch_size, rgb_mean=None, mean_img=None, - image_shape='3,224,224', data_nthreads=4, label_name='softmax_label', max_num_examples=None): - # create data iterator - data_shape = tuple([int(i) for i in image_shape.split(',')]) - if mean_img is not None: - mean_args = {'mean_img':mean_img} - elif rgb_mean is not None: - rgb_mean = [float(i) for i in rgb_mean.split(',')] - mean_args = {'mean_r':rgb_mean[0], 'mean_g':rgb_mean[1], - 'mean_b':rgb_mean[2]} - - data = mx.io.ImageRecordIter( - path_imgrec = data_val, - label_width = 1, - preprocess_threads = data_nthreads, - batch_size = batch_size, - data_shape = data_shape, - label_name = label_name, - rand_crop = False, - rand_mirror = False, - **mean_args) - - if isinstance(model, str): - # download model - dir_path = os.path.dirname(os.path.realpath(__file__)) - (prefix, epoch) = modelzoo.download_model( - model, os.path.join(dir_path, 'model')) - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - elif isinstance(model, tuple) or isinstance(model, list): - assert len(model) == 3 - (sym, arg_params, aux_params) = model - else: - raise TypeError('model type [%s] is not supported' % str(type(model))) - - # create module - if gpus == '': - devs = mx.cpu() - else: - devs = [mx.gpu(int(i)) for i in gpus.split(',')] - - mod = mx.mod.Module(symbol=sym, context=devs, label_names=[label_name,]) - mod.bind(for_training=False, - data_shapes=data.provide_data, - label_shapes=data.provide_label) - mod.set_params(arg_params, aux_params) - if not isinstance(metrics, list): - metrics = [metrics,] - tic = time.time() - num = 0 - for batch in data: - mod.forward(batch, is_train=False) - for m in metrics: - mod.update_metric(m, batch.label) - num += batch_size - if max_num_examples is not None and num > max_num_examples: - break - return (num / (time.time() - tic), ) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='score a model on a dataset') - parser.add_argument('--model', type=str, required=True, - help = 'the model name.') - parser.add_argument('--gpus', type=str, default='0') - parser.add_argument('--batch-size', type=int, default=64) - parser.add_argument('--rgb-mean', type=str, default='0,0,0') - parser.add_argument('--data-val', type=str, required=True) - parser.add_argument('--image-shape', type=str, default='3,224,224') - parser.add_argument('--data-nthreads', type=int, default=4, - help='number of threads for data decoding') - args = parser.parse_args() - - logger = logging.getLogger() - logger.setLevel(logging.DEBUG) - - metrics = [mx.gluon.metric.create('acc'), - mx.gluon.metric.create('top_k_accuracy', top_k = 5)] - - (speed,) = score(metrics = metrics, **vars(args)) - logging.info('Finished with %f images per second', speed) - - for m in metrics: - logging.info(m.get()) diff --git a/example/image-classification/symbol_alexnet.R b/example/image-classification/symbol_alexnet.R deleted file mode 100644 index 4d3e7ecdc578..000000000000 --- a/example/image-classification/symbol_alexnet.R +++ /dev/null @@ -1,53 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -library(mxnet) - -get_symbol <- function(num_classes = 1000) { - input_data <- mx.symbol.Variable(name = "data") - # stage 1 - conv1 <- mx.symbol.Convolution(data = input_data, kernel = c(11, 11), stride = c(4, 4), num_filter = 96) - relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu") - lrn1 <- mx.symbol.LRN(data = relu1, alpha = 0.0001, beta = 0.75, knorm = 2, nsize = 5) - pool1 <- mx.symbol.Pooling(data = lrn1, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") - # stage 2 - conv2 <- mx.symbol.Convolution(data = lrn1, kernel = c(5, 5), pad = c(2, 2), num_filter = 256) - relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu") - lrn2 <- mx.symbol.LRN(data = relu2, alpha = 0.0001, beta = 0.75, knorm = 2, nsize = 5) - pool2 <- mx.symbol.Pooling(data = lrn2, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") - # stage 3 - conv3 <- mx.symbol.Convolution(data = lrn2, kernel = c(3, 3), pad = c(1, 1), num_filter = 384) - relu3 <- mx.symbol.Activation(data = conv3, act_type = "relu") - conv4 <- mx.symbol.Convolution(data = relu3, kernel = c(3, 3), pad = c(1, 1), num_filter = 384) - relu4 <- mx.symbol.Activation(data = conv4, act_type = "relu") - conv5 <- mx.symbol.Convolution(data = relu4, kernel = c(3, 3), pad = c(1, 1), num_filter = 256) - relu5 <- mx.symbol.Activation(data = conv5, act_type = "relu") - pool3 <- mx.symbol.Pooling(data = relu5, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") - # stage 4 - flatten <- mx.symbol.Flatten(data = pool3) - fc1 <- mx.symbol.FullyConnected(data = flatten, num_hidden = 4096) - relu6 <- mx.symbol.Activation(data = fc1, act_type = "relu") - dropout1 <- mx.symbol.Dropout(data = relu6, p = 0.5) - # stage 5 - fc2 <- mx.symbol.FullyConnected(data = dropout1, num_hidden = 4096) - relu7 <- mx.symbol.Activation(data = fc2, act_type = "relu") - dropout2 <- mx.symbol.Dropout(data = relu7, p = 0.5) - # stage 6 - fc3 <- mx.symbol.FullyConnected(data = dropout2, num_hidden = num_classes) - softmax <- mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') - return(softmax) -} diff --git a/example/image-classification/symbol_googlenet.R b/example/image-classification/symbol_googlenet.R deleted file mode 100644 index fca9bfc0f1a7..000000000000 --- a/example/image-classification/symbol_googlenet.R +++ /dev/null @@ -1,84 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -library(mxnet) - -ConvFactory <- function(data, num_filter, kernel, stride = c(1, 1), pad = c(0, 0), - name = '', suffix = '') { - conv <- mx.symbol.Convolution(data = data, num_filter = num_filter, kernel = kernel, stride = stride, - pad = pad, name = paste('conv_', name, suffix, sep = "")) - act <- mx.symbol.Activation(data = conv, act_type = 'relu', name = paste('relu_', name, suffix, sep = '')) - return(act) -} - -InceptionFactory <- function(data, num_1x1, num_3x3red, num_3x3, - num_d5x5red, num_d5x5, pool, proj, name) { - # 1x1 - c1x1 <- ConvFactory(data = data, num_filter = num_1x1, kernel = c(1, 1), - name = paste(name, '_1x1', sep = '')) - # 3x3 reduce + 3x3 - c3x3r = ConvFactory(data = data, num_filter = num_3x3red, kernel = c(1, 1), - name = paste(name, '_3x3', sep = ''), suffix = '_reduce') - c3x3 = ConvFactory(data = c3x3r, num_filter = num_3x3, kernel = c(3, 3), - pad = c(1, 1), name = paste(name, '_3x3', sep = '')) - # double 3x3 reduce + double 3x3 - cd5x5r = ConvFactory(data = data, num_filter = num_d5x5red, kernel = c(1, 1), - name = paste(name, '_5x5', sep = ''), suffix = '_reduce') - cd5x5 = ConvFactory(data = cd5x5r, num_filter = num_d5x5, kernel = c(5, 5), pad = c(2, 2), - name = paste(name, '_5x5', sep = '')) - # pool + proj - pooling = mx.symbol.Pooling(data = data, kernel = c(3, 3), stride = c(1, 1), - pad = c(1, 1), pool_type = pool, - name = paste(pool, '_pool_', name, '_pool', sep = '')) - - cproj = ConvFactory(data = pooling, num_filter = proj, kernel = c(1, 1), - name = paste(name, '_proj', sep = '')) - # concat - concat_lst <- list() - concat_lst <- c(c1x1, c3x3, cd5x5, cproj) - concat_lst$num.args = 4 - concat_lst$name = paste('ch_concat_', name, '_chconcat', sep = '') - concat = mxnet:::mx.varg.symbol.Concat(concat_lst) - return(concat) -} - - -get_symbol <- function(num_classes = 1000) { - data <- mx.symbol.Variable("data") - conv1 <- ConvFactory(data, 64, kernel = c(7, 7), stride = c(2, 2), pad = c(3, 3), name = "conv1") - pool1 <- mx.symbol.Pooling(conv1, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") - conv2 <- ConvFactory(pool1, 64, kernel = c(1, 1), stride = c(1, 1), name = "conv2") - conv3 <- ConvFactory(conv2, 192, kernel = c(3, 3), stride = c(1, 1), pad = c(1, 1), name = "conv3") - pool3 <- mx.symbol.Pooling(conv3, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") - - in3a <- InceptionFactory(pool3, 64, 96, 128, 16, 32, "max", 32, name = "in3a") - in3b <- InceptionFactory(in3a, 128, 128, 192, 32, 96, "max", 64, name = "in3b") - pool4 <- mx.symbol.Pooling(in3b, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") - in4a <- InceptionFactory(pool4, 192, 96, 208, 16, 48, "max", 64, name = "in4a") - in4b <- InceptionFactory(in4a, 160, 112, 224, 24, 64, "max", 64, name = "in4b") - in4c <- InceptionFactory(in4b, 128, 128, 256, 24, 64, "max", 64, name = "in4c") - in4d <- InceptionFactory(in4c, 112, 144, 288, 32, 64, "max", 64, name = "in4d") - in4e <- InceptionFactory(in4d, 256, 160, 320, 32, 128, "max", 128, name = "in4e") - pool5 <- mx.symbol.Pooling(in4e, kernel = c(3, 3), stride = c(2, 2), pool_type = "max") - in5a <- 
InceptionFactory(pool5, 256, 160, 320, 32, 128, "max", 128, name = "in5a") - in5b <- InceptionFactory(in5a, 384, 192, 384, 48, 128, "max", 128, name = "in5b") - pool6 <- mx.symbol.Pooling(in5b, kernel = c(7, 7), stride = c(1, 1), pool_type = "avg" ) - flatten <- mx.symbol.Flatten(data = pool6, name = 'flatten0') - fc1 <- mx.symbol.FullyConnected(data = flatten, num_hidden = num_classes) - softmax <- mx.symbol.SoftmaxOutput(data = fc1, name = 'softmax') - return(softmax) -} diff --git a/example/image-classification/symbol_inception-bn-28-small.R b/example/image-classification/symbol_inception-bn-28-small.R deleted file mode 100644 index 7ecbf5d88b52..000000000000 --- a/example/image-classification/symbol_inception-bn-28-small.R +++ /dev/null @@ -1,89 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -library(mxnet) - -# Basic Conv + BN + ReLU factory -ConvFactory <- function(data, num_filter, kernel, stride = c(1,1), - pad = c(0, 0), act_type = "relu") { - conv = mx.symbol.Convolution( - data = data, num_filter = num_filter, kernel = kernel, stride = stride, pad = - pad - ) - bn = mx.symbol.BatchNorm(data = conv) - act = mx.symbol.Activation(data = bn, act_type = act_type) - return(act) -} - -# A Simple Downsampling Factory -DownsampleFactory <- function(data, ch_3x3) { - # conv 3x3 - conv = ConvFactory( - data = data, kernel = c(3, 3), stride = c(2, 2), num_filter = ch_3x3, pad = - c(1, 1) - ) - - # pool - pool = mx.symbol.Pooling( - data = data, kernel = c(3, 3), stride = c(2, 2), pad = c(1, 1), pool_type = - 'max' - ) - # concat - concat = mx.symbol.Concat(c(conv, pool), num.args = 2) - return(concat) -} - -# A Simple module -SimpleFactory <- function(data, ch_1x1, ch_3x3) { - # 1x1 - conv1x1 = ConvFactory( - data = data, kernel = c(1, 1), pad = c(0, 0), num_filter = ch_1x1 - ) - # 3x3 - conv3x3 = ConvFactory( - data = data, kernel = c(3, 3), pad = c(1, 1), num_filter = ch_3x3 - ) - #concat - concat = mx.symbol.Concat(c(conv1x1, conv3x3), num.args = 2) - return(concat) -} - -get_symbol <- function(num_classes = 10) { - data = mx.symbol.Variable(name = "data") - conv1 = ConvFactory( - data = data, kernel = c(3,3), pad = c(1,1), num_filter = 96, - act_type = "relu" - ) - in3a = SimpleFactory(conv1, 32, 32) - in3b = SimpleFactory(in3a, 32, 48) - in3c = DownsampleFactory(in3b, 80) - in4a = SimpleFactory(in3c, 112, 48) - in4b = SimpleFactory(in4a, 96, 64) - in4c = SimpleFactory(in4b, 80, 80) - in4d = SimpleFactory(in4c, 48, 96) - in4e = DownsampleFactory(in4d, 96) - in5a = SimpleFactory(in4e, 176, 160) - in5b = SimpleFactory(in5a, 176, 160) - pool = mx.symbol.Pooling( - data = in5b, pool_type = "avg", kernel = c(7,7), name = "global_pool" - ) - flatten = mx.symbol.Flatten(data = pool, name = "flatten1") - fc = mx.symbol.FullyConnected(data = flatten, 
num_hidden = num_classes, name = - "fc1") - softmax = mx.symbol.SoftmaxOutput(data = fc, name = "softmax") - return(softmax) -} \ No newline at end of file diff --git a/example/image-classification/symbol_inception-bn.R b/example/image-classification/symbol_inception-bn.R deleted file mode 100644 index c70989ff059c..000000000000 --- a/example/image-classification/symbol_inception-bn.R +++ /dev/null @@ -1,134 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -library(mxnet) - -eps = 1e-10 + 1e-5 -bn_mom = 0.9 -fix_gamma = FALSE - -ConvFactory <- function(data, num_filter, kernel, stride = c(1, 1), - pad = c(0, 0), name = '', suffix = '') { - conv <- mx.symbol.Convolution(data = data, num_filter = num_filter, - kernel = kernel, stride = stride, pad = pad, - name = paste('conv_', name, suffix, sep = '')) - - bn <- mx.symbol.BatchNorm(data = conv, eps = eps, momentum = bn_mom, fix.gamma = fix_gamma, name = paste('bn_', name, suffix, sep = '')) - act <- mx.symbol.Activation(data = bn, act_type = 'relu', name = paste('relu_', name, suffix, sep = '')) - return(act) -} - -InceptionFactoryA <- function(data, num_1x1, num_3x3red, num_3x3, num_d3x3red, - num_d3x3, pool, proj, name) { - # 1x1 - c1x1 <- ConvFactory(data = data, num_filter = num_1x1, kernel = c(1, 1), name = paste(name, '_1x1', sep = '') - ) - # 3x3 reduce + 3x3 - c3x3r <- ConvFactory(data = data, num_filter = num_3x3red, kernel = c(1, 1), - name = paste(name, '_3x3', sep = ''), suffix = '_reduce') - - c3x3 <- ConvFactory(data = c3x3r, num_filter = num_3x3, kernel = c(3, 3), - pad = c(1, 1), name = paste(name, '_3x3', sep = '')) - # double 3x3 reduce + double 3x3 - cd3x3r <- ConvFactory(data = data, num_filter = num_d3x3red, kernel = c(1, 1), - name = paste(name, '_double_3x3', sep = ''), suffix = '_reduce') - - cd3x3 <- ConvFactory(data = cd3x3r, num_filter = num_d3x3, kernel = c(3, 3), - pad = c(1, 1), name = paste(name, '_double_3x3_0', sep = '')) - - cd3x3 <- ConvFactory(data = cd3x3, num_filter = num_d3x3, kernel = c(3, 3), - pad = c(1, 1), name = paste(name, '_double_3x3_1', sep = '')) - # pool + proj - pooling <- mx.symbol.Pooling(data = data, kernel = c(3, 3), stride = c(1, 1), - pad = c(1, 1), pool_type = pool, - name = paste(pool, '_pool_', name, '_pool', sep = '')) - cproj <- ConvFactory(data = pooling, num_filter = proj, kernel = c(1, 1), - name = paste(name, '_proj', sep = '')) - # concat - concat_lst <- list() - concat_lst <- c(c1x1, c3x3, cd3x3, cproj) - concat_lst$num.args = 4 - concat_lst$name = paste('ch_concat_', name, '_chconcat', sep = '') - concat = mxnet:::mx.varg.symbol.Concat(concat_lst) - return(concat) -} - -InceptionFactoryB <- function(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name) { - # 3x3 reduce + 3x3 - c3x3r <- ConvFactory(data = data, num_filter = 
num_3x3red, kernel = c(1, 1), - name = paste(name, '_3x3', sep = ''), suffix = '_reduce') - c3x3 <- ConvFactory(data = c3x3r, num_filter = num_3x3, kernel = c(3, 3), - pad = c(1, 1), stride = c(2, 2), name = paste(name, '_3x3', sep = '')) - # double 3x3 reduce + double 3x3 - cd3x3r <- ConvFactory(data = data, num_filter = num_d3x3red, kernel = c(1, 1), - name = paste(name, '_double_3x3', sep = ''), suffix = '_reduce') - cd3x3 <- ConvFactory(data = cd3x3r, num_filter = num_d3x3, kernel = c(3, 3), - pad = c(1, 1), stride = c(1, 1), name = paste(name, '_double_3x3_0', sep = '')) - cd3x3 = ConvFactory(data = cd3x3, num_filter = num_d3x3, kernel = c(3, 3), - pad = c(1, 1), stride = c(2, 2), name = paste(name, '_double_3x3_1', sep = '')) - # pool + proj - pooling = mx.symbol.Pooling(data = data, kernel = c(3, 3), stride = c(2, 2), - pad = c(1, 1), pool_type = "max", - name = paste('max_pool_', name, '_pool', sep = '')) - # concat - concat_lst <- list() - concat_lst <- c(c3x3, cd3x3, pooling) - concat_lst$num.args = 3 - concat_lst$name = paste('ch_concat_', name, '_chconcat', sep = '') - concat = mxnet:::mx.varg.symbol.Concat(concat_lst) - return(concat) -} - -get_symbol <- function(num_classes = 1000) { - # data - data = mx.symbol.Variable(name = "data") - # stage 1 - conv1 = ConvFactory(data = data, num_filter = 64, kernel = c(7, 7), - stride = c(2, 2), pad = c(3, 3), name = '1') - pool1 = mx.symbol.Pooling(data = conv1, kernel = c(3, 3), stride = c(2, 2), - name = 'pool_1', pool_type = 'max') - # stage 2 - conv2red = ConvFactory(data = pool1, num_filter = 64, kernel = c(1, 1), - stride = c(1, 1), name = '2_red') - conv2 = ConvFactory(data = conv2red, num_filter = 192, kernel = c(3, 3), - stride = c(1, 1), pad = c(1, 1), name = '2') - pool2 = mx.symbol.Pooling(data = conv2, kernel = c(3, 3), stride = c(2, 2), - name = 'pool_2', pool_type = 'max') - # stage 2 - in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, "avg", 32, '3a') - in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, "avg", 64, '3b') - in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, '3c') - # stage 3 - in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, '4a') - in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, '4b') - in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, '4c') - in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, "avg", 128, '4d') - in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, '4e') - # stage 4 - in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, '5a') - in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, '5b') - # global avg pooling - avg = mx.symbol.Pooling(data = in5b, kernel = c(7, 7), stride = c(1, 1), - name = "global_pool", pool_type = 'avg') - # linear classifier - flatten = mx.symbol.Flatten(data = avg, name = 'flatten') - fc1 = mx.symbol.FullyConnected(data = flatten, - num_hidden = num_classes, - name = 'fc1') - softmax = mx.symbol.SoftmaxOutput(data = fc1, name = 'softmax') - return(softmax) -} diff --git a/example/image-classification/symbol_inception-resnet-v1.R b/example/image-classification/symbol_inception-resnet-v1.R deleted file mode 100644 index 09ed713aeb85..000000000000 --- a/example/image-classification/symbol_inception-resnet-v1.R +++ /dev/null @@ -1,410 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Inception resnet v1, suitable for images with around 299 x 299 -# -# Reference: -# Szegedy C, Ioffe S, Vanhoucke V. Inception-v4, inception-resnet and -# the impact of residual connections on learning, 2016. -# Link to the paper: https://arxiv.org/abs/1602.07261 -# -library(mxnet) - -Conv <- function(data, num_filter, kernel=c(1, 1), stride=c(1, 1), pad=c(0, 0), - name, suffix="", withRelu=TRUE, withBn=FALSE){ - conv <- mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, - stride=stride, pad=pad, - name=paste0(name, suffix, "_conv2d")) - if (withBn){ - conv <- mx.symbol.BatchNorm(data=conv, name=paste0(name, suffix, "_bn")) - } - if (withRelu){ - conv <- mx.symbol.Activation(data=conv, act_type="relu", - name=paste0(name, suffix, "_relu")) - } - - return(conv) -} - -# Input Shape is 299*299*3 (th) -InceptionResnetStem <- function(data, - num_1_1, num_1_2, num_1_3, - num_2_1, num_2_2, num_2_3, - name){ - stem_3x3 <- Conv(data=data, num_filter=num_1_1, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_conv")) - stem_3x3 <- Conv(data=stem_3x3, num_filter=num_1_2, kernel=c(3, 3), - name=paste0(name, "_stem"), suffix="_conv_1") - stem_3x3 <- Conv(data=stem_3x3, num_filter=num_1_3, kernel=c(3, 3), - name=paste0(name, "_stem"), suffix="_conv_2") - pool1 <- mx.symbol.Pooling(data=stem_3x3, kernel=c(3, 3), stride=c(2, 2), - pool_type="max", name=paste0("max_", name, "_pool1")) - stem_1_3x3 <- Conv(data=pool1, num_filter=num_2_1, name=paste0(name, "_stem_1"), - suffix="_conv_1") - stem_1_3x3 <- Conv(data=stem_1_3x3, num_filter=num_2_2, kernel=c(3, 3), - name=paste0(name, "_stem_1"), suffix="_conv_2") - stem_1_3x3 <- Conv(data=stem_1_3x3, num_filter=num_2_3, kernel=c(3, 3), - pad=c(1, 1), stride=c(2, 2), name=paste0(name, "_stem_1"), - suffix="_conv_3", withRelu=FALSE) - bn1 <- mx.symbol.BatchNorm(data=stem_1_3x3, name=paste0(name, "_bn1")) - act1 <- mx.symbol.Activation(data=bn1, act_type="relu", name=paste0(name, "_relu1")) - - return(act1) -} - -InceptionResnetA <- function(data, - num_1_1, - num_2_1, num_2_2, - num_3_1, num_3_2, num_3_3, - proj, - name, - scaleResidual=TRUE){ - init <- data - - a1 <- Conv(data=data, num_filter=num_1_1, name=paste0(name, "_a_1"), suffix="_conv") - - a2 <- Conv(data=data, num_filter=num_2_1, name=paste0(name, "_a_2"), suffix="_conv_1") - a2 <- Conv(data=a2, num_filter=num_2_2, kernel=c(3, 3), pad=c(1, 1), - name=paste0(name, "_a_2"), suffix="_conv_2") - - a3 <- Conv(data=data, num_filter=num_3_1, name=paste0(name, "_a_3"), suffix="_conv_1") - a3 <- Conv(data=a3, num_filter=num_3_2, kernel=c(3, 3), pad=c(1, 1), - name=paste0(name, "_a_3"), suffix="_conv_2") - a3 <- Conv(data=a3, num_filter=num_3_3, kernel=c(3, 3), pad=c(1, 1), - name=paste0(name, "_a_3"), suffix="_conv_3") - # concat - merge_lst <- list() - merge_lst <- c(a1, a2, a3) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_a_concat1") - merge <- 
mxnet:::mx.varg.symbol.Concat(merge_lst) - - conv <- Conv(data=merge, num_filter=proj, name=paste0(name, "_a_liner_conv"), - withRelu=FALSE) - if(scaleResidual){ - conv <- conv*0.1 - } - - out <- init + conv - bn <- mx.symbol.BatchNorm(data=out, name=paste0(name, "_a_bn1")) - act <- mx.symbol.Activation(data=bn, act_type="relu", name=paste0(name, "_a_relu1")) - - return(act) -} - -InceptionResnetB <- function(data, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - name, - scaleResidual=TRUE){ - init <- data - - b1 <- Conv(data=data, num_filter=num_1_1, name=paste0(name, "_b_1"), suffix="_conv") - - b2 <- Conv(data=data, num_filter=num_2_1, name=paste0(name, "_b_2"), suffix="_conv_1") - b2 <- Conv(data=b2, num_filter=num_2_2, kernel=c(1, 7), pad=c(0, 3), - name=paste0(name, "_b_2"), suffix="_conv_2") - b2 <- Conv(data=b2, num_filter=num_2_3, kernel=c(7, 1), pad=c(3, 0), - name=paste0(name, "_b_2"), suffix="_conv_3") - - merge_lst <- list() - merge_lst <- c(b1, b2) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_b_concat1") - merge <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - conv <- Conv(data=merge, num_filter=proj, name=paste0(name, "_b_liner_conv"), - withRelu=FALSE) - if(scaleResidual){ - conv <- conv*0.1 - } - - out <- init + conv - bn <- mx.symbol.BatchNorm(data=out, name=paste0(name, "_b_bn1")) - act <- mx.symbol.Activation(data=bn, act_type="relu", name=paste0(name, "_b_relu1")) - - return(act) -} - -InceptionResnetC <- function(data, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - name, - scaleResidual=TRUE){ - - init <- data - - c1 <- Conv(data=data, num_filter=num_1_1, name=paste0(name, "_c_1"), suffix="_conv") - - c2 <- Conv(data=data, num_filter=num_2_1, name=paste0(name, "_c_2"), suffix="_conv_1") - c2 <- Conv(data=c2, num_filter=num_2_2, kernel=c(1, 3), pad=c(0, 1), - name=paste0(name, "_c_2"), suffix="_conv_2") - c2 <- Conv(data=c2, num_filter=num_2_3, kernel=c(3, 1), pad=c(1, 0), - name=paste0(name, "_c_2"), suffix="_conv_3") - - merge_lst <- list() - merge_lst <- c(c1, c2) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_c_concat1") - merge <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - conv <- Conv(data=merge, num_filter=proj, name=paste0(name, "_b_liner_conv"), - withRelu=FALSE) - if(scaleResidual){ - conv <- conv*0.1 - } - - out <- init + conv - bn <- mx.symbol.BatchNorm(data=out, name=paste0(name, "_c_bn1")) - act <- mx.symbol.Activation(data=bn, act_type="relu", name=paste0(name, "_c_relu1")) - - return(act) -} - -ReductionResnetA <- function(data, - num_2_1, - num_3_1, num_3_2, num_3_3, - name){ - - ra1 <- mx.symbol.Pooling(data=data, kernel=c(3, 3), stride=c(2, 2), - pool_type="max", name=paste0("max_", name, "_pool1")) - - ra2 <- Conv(data=data, num_filter=num_2_1, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_ra_2"), suffix="_conv", withRelu=FALSE) - - ra3 <- Conv(data=data, num_filter=num_3_1, name=paste0(name, "_ra_3"), suffix="_conv_1") - ra3 <- Conv(data=ra3, num_filter=num_3_2, kernel=c(3, 3), pad=c(1, 1), - name=paste0(name, "_ra_3"), suffix="_conv_2") - ra3 <- Conv(data=ra3, num_filter=num_3_3, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_ra_3"), suffix="_conv_3", withRelu=FALSE) - - merge_lst <- list() - merge_lst <- c(ra1, ra2, ra3) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_ra_concat1") - m <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - m <- mx.symbol.BatchNorm(data=m, name=paste0(name, "_ra_bn1")) - m <- mx.symbol.Activation(data=m, 
act_type="relu", name=paste0(name, "_ra_relu1")) - - return(m) -} - -ReductionResnetB <- function(data, - num_2_1, num_2_2, - num_3_1, num_3_2, - num_4_1, num_4_2, num_4_3, - name){ - rb1 <- mx.symbol.Pooling(data=data, kernel=c(3, 3), stride=c(2, 2), - pool_type="max", name=paste0("max_", name, "_pool1")) - - rb2 <- Conv(data=data, num_filter=num_2_1, name=paste0(name, "_rb_2"), suffix="_conv_1") - rb2 <- Conv(data=rb2, num_filter=num_2_2, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_rb_2"), suffix="_conv_2", withRelu=FALSE) - - rb3 <- Conv(data=data, num_filter=num_3_1, name=paste0(name, "_rb_3"), suffix="_conv_1") - rb3 <- Conv(data=rb3, num_filter=num_3_2, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_rb_3"), suffix="_conv_2", withRelu=FALSE) - - rb4 <- Conv(data=data, num_filter=num_4_1, name=paste0(name, "_rb_4"), suffix="_conv_1") - rb4 <- Conv(data=rb4, num_filter=num_4_2, kernel=c(3, 3), pad=c(1, 1), - name=paste0(name, "_rb_4"), suffix="_conv_2") - rb4 <- Conv(data=rb4, num_filter=num_4_3, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_rb_4"), suffix="_conv_3", withRelu=FALSE) - - merge_lst <- list() - merge_lst <- c(rb1, rb2, rb3, rb4) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_rb_concat1") - m <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - m <- mx.symbol.BatchNorm(data=m, name=paste0(name, "_rb_bn1")) - m <- mx.symbol.Activation(data=m, act_type="relu", name=paste0(name, "_rb_relu1")) - - return(m) -} - -circle_in3a <- function(data, - num_1_1, - num_2_1, num_2_2, - num_3_1, num_3_2, num_3_3, - proj, - name, - scale, - round){ - in3a <- data - for(i in 1:round){ - in3a <- InceptionResnetA(in3a, - num_1_1, - num_2_1, num_2_2, - num_3_1, num_3_2, num_3_3, - proj, - paste0(name, "_", i), - scaleResidual=scale) - } - return(in3a) - -} - -circle_in2b <- function(data, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - name, - scale, - round){ - in2b <- data - for(i in 1:round){ - in2b <- InceptionResnetB(in2b, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - paste0(name, "_", i), - scaleResidual=scale) - } - return(in2b) -} - -circle_in2c <- function(data, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - name, - scale, - round){ - in2c <- data - for(i in 1:round){ - in2c <- InceptionResnetC(in2c, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - paste0(name, "_", i), - scaleResidual=scale) - } - return(in2c) -} - -# create inception-resnet-v1 -get_symbol <- function(num_classes=1000, scale=TRUE){ - - # input shape 229*229*3 - data <- mx.symbol.Variable(name="data") - - # stage stem - num_1_1 <- 32 - num_1_2 <- 32 - num_1_3 <- 64 - num_2_1 <- 80 - num_2_2 <- 192 - num_2_3 <- 256 - - in_stem <- InceptionResnetStem(data, - num_1_1, num_1_2, num_1_3, - num_2_1, num_2_2, num_2_3, - "stem_stage") - - # stage 5 x Inception Resnet A - num_1_1 <- 32 - num_2_1 <- 32 - num_2_2 <- 32 - num_3_1 <- 32 - num_3_2 <- 32 - num_3_3 <- 32 - proj <- 256 - - in3a <- circle_in3a(in_stem, - num_1_1, - num_2_1, num_2_2, - num_3_1, num_3_2, num_3_3, - proj, - "in3a", - scale, - 5) - - # stage Reduction Resnet A - num_1_1 <- 384 - num_2_1 <- 192 - num_2_2 <- 192 - num_2_3 <- 256 - - re3a <- ReductionResnetA(in3a, - num_1_1, - num_2_1, num_2_2, num_2_3, - "re3a") - - # stage 10 x Inception Resnet B - num_1_1 <- 128 - num_2_1 <- 128 - num_2_2 <- 128 - num_2_3 <- 128 - proj <- 896 - - in2b <- circle_in2b(re3a, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - "in2b", - scale, - 10) - - # stage Reduction Resnet B - num_1_1 <- 256 - num_1_2 <- 384 - 
num_2_1 <- 256 - num_2_2 <- 256 - num_3_1 <- 256 - num_3_2 <- 256 - num_3_3 <- 256 - - re4b <- ReductionResnetB(in2b, - num_1_1, num_1_2, - num_2_1, num_2_2, - num_3_1, num_3_2, num_3_3, - "re4b") - - # stage 5 x Inception Resnet C - num_1_1 <- 128 - num_2_1 <- 192 - num_2_2 <- 192 - num_2_3 <- 192 - proj <- 1792 - - in2c <- circle_in2c(re4b, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - "in2c", - scale, - 5) - - # stage Average Pooling - pool <- mx.symbol.Pooling(data=in2c, kernel=c(8, 8), stride=c(1, 1), - pool_type="avg", name="global_pool") - - # stage Dropout - dropout <- mx.symbol.Dropout(data=pool, p=0.2) - # dropout = mx.symbol.Dropout(data=pool, p=0.8) - flatten <- mx.symbol.Flatten(data=dropout, name="flatten") - - # output - fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes, name="fc1") - softmax <- mx.symbol.SoftmaxOutput(data=fc1, name="softmax") - - return(softmax) -} \ No newline at end of file diff --git a/example/image-classification/symbol_inception-resnet-v2.R b/example/image-classification/symbol_inception-resnet-v2.R deleted file mode 100644 index 071432098638..000000000000 --- a/example/image-classification/symbol_inception-resnet-v2.R +++ /dev/null @@ -1,455 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Inception resnet v2, suitable for images with around 299 x 299 -# -# Reference: -# Szegedy C, Ioffe S, Vanhoucke V. Inception-v4, inception-resnet and -# the impact of residual connections on learning, 2016. 
-# Link to the paper: https://arxiv.org/abs/1602.07261 -# -library(mxnet) - -Conv <- function(data, num_filter, kernel=c(1, 1), stride=c(1, 1), pad=c(0, 0), - name, suffix="", withRelu=TRUE, withBn=FALSE){ - conv <- mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, - stride=stride, pad=pad, - name=paste0(name, suffix, "_conv2d")) - if (withBn){ - conv <- mx.symbol.BatchNorm(data=conv, name=paste0(name, suffix, "_bn")) - } - if (withRelu){ - conv <- mx.symbol.Activation(data=conv, act_type="relu", - name=paste0(name, suffix, "_relu")) - } - - return(conv) -} - -# Input Shape is 299*299*3 (th) -InceptionResnetStem <- function(data, - num_1_1, num_1_2, num_1_3, - num_2_1, - num_3_1, num_3_2, - num_4_1, num_4_2, num_4_3, num_4_4, - num_5_1, - name){ - stem_3x3 <- Conv(data=data, num_filter=num_1_1, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_conv")) - stem_3x3 <- Conv(data=stem_3x3, num_filter=num_1_2, kernel=c(3, 3), - name=paste0(name, "_stem"), suffix="_conv") - stem_3x3 <- Conv(data=stem_3x3, num_filter=num_1_3, kernel=c(3, 3), pad=c(1,1), - name=paste0(name, "_stem"), suffix="_conv_1") - - - pool1 <- mx.symbol.Pooling(data=stem_3x3, kernel=c(3, 3), stride=c(2, 2), - pool_type="max", name=paste0("max_", name, "_pool1")) - - stem_1_3x3 <- Conv(data=stem_3x3, num_filter=num_2_1, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_stem_1"), suffix="_conv_1") - - merge_lst <- list() - merge_lst <- c(pool1, stem_1_3x3) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_concat1") - concat1 <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - stem_1_1x1 <- Conv(data=concat1, num_filter=num_3_1, name=paste0(name, "_stem_1"), suffix='_conv_2') - stem_1_3x3 <- Conv(data=stem_1_1x1, num_filter=num_3_2, kernel=c(3, 3), - name=paste0(name, "_stem_1"), suffix='_conv_3') - stem_2_1x1 <- Conv(data=concat1, num_filter=num_4_1, name=paste0(name, "_stem_2"), suffix='_conv_1') - stem_2_7x1 <- Conv(data=stem_2_1x1, num_filter=num_4_2, kernel=c(7, 1), pad=c(3, 0), - name=paste0(name, "_stem_2"), suffix='_conv_2') - stem_2_1x7 <- Conv(data=stem_2_7x1, num_filter=num_4_3, kernel=c(1, 7), pad=c(0, 3), - name=paste0(name, "_stem_2"), suffix='_conv_3') - stem_2_3x3 <- Conv(data=stem_2_1x7, num_filter=num_4_4, kernel=c(3, 3), - name=paste0(name, "_stem_2"), suffix='_conv_4') - - merge_lst <- list() - merge_lst <- c(stem_1_3x3, stem_2_3x3) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_concat2") - concat2 <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - pool2 <- mx.symbol.Pooling(data=concat2, kernel=c(3, 3), stride=c(2, 2), - pool_type="max", name=paste0("max_", name, "_pool2")) - - stem_3_3x3 <- Conv(data=concat2, num_filter=num_5_1, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_stem_3"), suffix='_conv_1', withRelu=FALSE) - - merge_lst <- list() - merge_lst <- c(pool2, stem_3_3x3) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_concat3") - concat3 <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - bn1 <- mx.symbol.BatchNorm(data=concat3, name=paste0(name, "_bn1")) - act1 <- mx.symbol.Activation(data=bn1, act_type="relu", name=paste0(name, "_relu1")) - - return(act1) -} - -InceptionResnetV2A <- function(data, - num_1_1, - num_2_1, num_2_2, - num_3_1, num_3_2, num_3_3, - proj, - name, - scaleResidual=TRUE){ - init <- data - - a1 <- Conv(data=data, num_filter=num_1_1, name=paste0(name, "_a_1"), suffix="_conv") - - a2 <- Conv(data=data, num_filter=num_2_1, name=paste0(name, "_a_2"), suffix="_conv_1") - a2 
<- Conv(data=a2, num_filter=num_2_2, kernel=c(3, 3), pad=c(1, 1), - name=paste0(name, "_a_2"), suffix="_conv_2") - - a3 <- Conv(data=data, num_filter=num_3_1, name=paste0(name, "_a_3"), suffix="_conv_1") - a3 <- Conv(data=a3, num_filter=num_3_2, kernel=c(3, 3), pad=c(1, 1), - name=paste0(name, "_a_3"), suffix="_conv_2") - a3 <- Conv(data=a3, num_filter=num_3_3, kernel=c(3, 3), pad=c(1, 1), - name=paste0(name, "_a_3"), suffix="_conv_3") - - merge_lst <- list() - merge_lst <- c(a1, a2, a3) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_a_concat1") - merge <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - conv <- Conv(data=merge, num_filter=proj, name=paste0(name, "_a_liner_conv"), - withRelu=FALSE) - if(scaleResidual){ - conv <- conv*0.1 - } - - out <- init + conv - bn <- mx.symbol.BatchNorm(data=out, name=paste0(name, "_a_bn1")) - act <- mx.symbol.Activation(data=bn, act_type="relu", name=paste0(name, "_a_relu1")) - - return(act) -} - -InceptionResnetV2B <- function(data, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - name, - scaleResidual=TRUE){ - init <- data - - b1 <- Conv(data=data, num_filter=num_1_1, name=paste0(name, "_b_1"), suffix="_conv") - - b2 <- Conv(data=data, num_filter=num_2_1, name=paste0(name, "_b_2"), suffix="_conv_1") - b2 <- Conv(data=b2, num_filter=num_2_2, kernel=c(1, 7), pad=c(0, 3), - name=paste0(name, "_b_2"), suffix="_conv_2") - b2 <- Conv(data=b2, num_filter=num_2_3, kernel=c(7, 1), pad=c(3, 0), - name=paste0(name, "_b_2"), suffix="_conv_3") - - merge_lst <- list() - merge_lst <- c(b1, b2) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_b_concat1") - merge <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - conv <- Conv(data=merge, num_filter=proj, name=paste0(name, "_b_liner_conv"), - withRelu=FALSE) - if(scaleResidual){ - conv <- conv*0.1 - } - - out <- init + conv - bn <- mx.symbol.BatchNorm(data=out, name=paste0(name, "_b_bn1")) - act <- mx.symbol.Activation(data=bn, act_type="relu", name=paste0(name, "_b_relu1")) - - return(act) -} - -InceptionResnetV2C <- function(data, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - name, - scaleResidual=TRUE){ - - init <- data - - c1 <- Conv(data=data, num_filter=num_1_1, name=paste0(name, "_c_1"), suffix="_conv") - - c2 <- Conv(data=data, num_filter=num_2_1, name=paste0(name, "_c_2"), suffix="_conv_1") - c2 <- Conv(data=c2, num_filter=num_2_2, kernel=c(1, 3), pad=c(0, 1), - name=paste0(name, "_c_2"), suffix="_conv_2") - c2 <- Conv(data=c2, num_filter=num_2_3, kernel=c(3, 1), pad=c(1, 0), - name=paste0(name, "_c_2"), suffix="_conv_3") - - merge_lst <- list() - merge_lst <- c(c1, c2) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_c_concat1") - merge <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - conv <- Conv(data=merge, num_filter=proj, name=paste0(name, "_b_liner_conv"), - withRelu=FALSE) - if(scaleResidual){ - conv <- conv*0.1 - } - - out <- init + conv - bn <- mx.symbol.BatchNorm(data=out, name=paste0(name, "_c_bn1")) - act <- mx.symbol.Activation(data=bn, act_type="relu", name=paste0(name, "_c_relu1")) - - return(act) -} - -ReductionResnetV2A <- function(data, - num_2_1, - num_3_1, num_3_2, num_3_3, - name){ - - ra1 <- mx.symbol.Pooling(data=data, kernel=c(3, 3), stride=c(2, 2), - pool_type="max", name=paste0("max_", name, "_pool1")) - - ra2 <- Conv(data=data, num_filter=num_2_1, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_ra_2"), suffix="_conv", withRelu=FALSE) - - ra3 <- Conv(data=data, num_filter=num_3_1, 
name=paste0(name, "_ra_3"), suffix="_conv_1") - ra3 <- Conv(data=ra3, num_filter=num_3_2, kernel=c(3, 3), pad=c(1, 1), - name=paste0(name, "_ra_3"), suffix="_conv_2") - ra3 <- Conv(data=ra3, num_filter=num_3_3, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_ra_3"), suffix="_conv_3", withRelu=FALSE) - - merge_lst <- list() - merge_lst <- c(ra1, ra2, ra3) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_ra_concat1") - m <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - m <- mx.symbol.BatchNorm(data=m, name=paste0(name, "_ra_bn1")) - m <- mx.symbol.Activation(data=m, act_type="relu", name=paste0(name, "_ra_relu1")) - - return(m) -} - -ReductionResnetV2B <- function(data, - num_2_1, num_2_2, - num_3_1, num_3_2, - num_4_1, num_4_2, num_4_3, - name){ - rb1 <- mx.symbol.Pooling(data=data, kernel=c(3, 3), stride=c(2, 2), - pool_type="max", name=paste0("max_", name, "_pool1")) - - rb2 <- Conv(data=data, num_filter=num_2_1, name=paste0(name, "_rb_2"), suffix="_conv_1") - rb2 <- Conv(data=rb2, num_filter=num_2_2, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_rb_2"), suffix="_conv_2", withRelu=FALSE) - - rb3 <- Conv(data=data, num_filter=num_3_1, name=paste0(name, "_rb_3"), suffix="_conv_1") - rb3 <- Conv(data=rb3, num_filter=num_3_2, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_rb_3"), suffix="_conv_2", withRelu=FALSE) - - rb4 <- Conv(data=data, num_filter=num_4_1, name=paste0(name, "_rb_4"), suffix="_conv_1") - rb4 <- Conv(data=rb4, num_filter=num_4_2, kernel=c(3, 3), pad=c(1, 1), - name=paste0(name, "_rb_4"), suffix="_conv_2") - rb4 <- Conv(data=rb4, num_filter=num_4_3, kernel=c(3, 3), stride=c(2, 2), - name=paste0(name, "_rb_4"), suffix="_conv_3", withRelu=FALSE) - - merge_lst <- list() - merge_lst <- c(rb1, rb2, rb3, rb4) - merge_lst$num.args <- length(merge_lst) - merge_lst$name <- paste0(name, "_rb_concat1") - m <- mxnet:::mx.varg.symbol.Concat(merge_lst) - - m <- mx.symbol.BatchNorm(data=m, name=paste0(name, "_rb_bn1")) - m <- mx.symbol.Activation(data=m, act_type="relu", name=paste0(name, "_rb_relu1")) - - return(m) -} - -circle_in3a <- function(data, - num_1_1, - num_2_1, num_2_2, - num_3_1, num_3_2, num_3_3, - proj, - name, - scale, - round){ - in3a <- data - for(i in 1:round){ - in3a <- InceptionResnetV2A(in3a, - num_1_1, - num_2_1, num_2_2, - num_3_1, num_3_2, num_3_3, - proj, - paste0(name, "_", i), - scaleResidual=scale) - } - return(in3a) - -} - -circle_in2b <- function(data, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - name, - scale, - round){ - in2b <- data - for(i in 1:round){ - in2b <- InceptionResnetV2B(in2b, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - paste0(name, "_", i), - scaleResidual=scale) - } - return(in2b) -} - -circle_in2c <- function(data, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - name, - scale, - round){ - in2c <- data - for(i in 1:round){ - in2c <- InceptionResnetV2C(in2c, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - paste0(name, "_", i), - scaleResidual=scale) - } - return(in2c) -} - -# create inception-resnet-v1 -get_symbol <- function(num_classes=1000, scale=TRUE){ - - # input shape 229*229*3 - data <- mx.symbol.Variable(name="data") - - # stage stem - num_1_1 <- 32 - num_1_2 <- 32 - num_1_3 <- 64 - num_2_1 <- 96 - num_3_1 <- 64 - num_3_2 <- 96 - num_4_1 <- 64 - num_4_2 <- 64 - num_4_3 <- 64 - num_4_4 <- 96 - num_5_1 <- 192 - - in_stem <- InceptionResnetStem(data, - num_1_1, num_1_2, num_1_3, - num_2_1, - num_3_1, num_3_2, - num_4_1, num_4_2, num_4_3, num_4_4, - num_5_1, - "stem_stage") - - # 
stage 5 x Inception Resnet A - num_1_1 <- 32 - num_2_1 <- 32 - num_2_2 <- 32 - num_3_1 <- 32 - num_3_2 <- 48 - num_3_3 <- 64 - proj <- 384 - - in3a <- circle_in3a(in_stem, - num_1_1, - num_2_1, num_2_2, - num_3_1, num_3_2, num_3_3, - proj, - "in3a", - scale, - 5) - - # stage Reduction Resnet A - num_1_1 <- 384 - num_2_1 <- 256 - num_2_2 <- 256 - num_2_3 <- 384 - - re3a <- ReductionResnetV2A(in3a, - num_1_1, - num_2_1, num_2_2, num_2_3, - "re3a") - - # stage 10 x Inception Resnet B - num_1_1 <- 192 - num_2_1 <- 128 - num_2_2 <- 160 - num_2_3 <- 192 - proj <- 1152 - - in2b <- circle_in2b(re3a, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - "in2b", - scale, - 10) - - # stage Reduction Resnet B - num_1_1 <- 256 - num_1_2 <- 384 - num_2_1 <- 256 - num_2_2 <- 288 - num_3_1 <- 256 - num_3_2 <- 288 - num_3_3 <- 320 - - re4b <- ReductionResnetV2B(in2b, - num_1_1, num_1_2, - num_2_1, num_2_2, - num_3_1, num_3_2, num_3_3, - "re4b") - - # stage 5 x Inception Resnet C - num_1_1 <- 192 - num_2_1 <- 192 - num_2_2 <- 224 - num_2_3 <- 256 - proj <- 2144 - - in2c <- circle_in2c(re4b, - num_1_1, - num_2_1, num_2_2, num_2_3, - proj, - "in2c", - scale, - 5) - - # stage Average Pooling - pool <- mx.symbol.Pooling(data=in2c, kernel=c(8, 8), stride=c(1, 1), - pool_type="avg", name="global_pool") - - # stage Dropout - dropout <- mx.symbol.Dropout(data=pool, p=0.2) - # dropout = mx.symbol.Dropout(data=pool, p=0.8) - flatten <- mx.symbol.Flatten(data=dropout, name="flatten") - - # output - fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes, name="fc1") - softmax <- mx.symbol.SoftmaxOutput(data=fc1, name="softmax") - - return(softmax) -} \ No newline at end of file diff --git a/example/image-classification/symbol_lenet.R b/example/image-classification/symbol_lenet.R deleted file mode 100644 index d41217d6d88a..000000000000 --- a/example/image-classification/symbol_lenet.R +++ /dev/null @@ -1,41 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -library(mxnet) - -get_symbol <- function(num_classes = 1000) { - data <- mx.symbol.Variable('data') - # first conv - conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 20) - - tanh1 <- mx.symbol.Activation(data = conv1, act_type = "tanh") - pool1 <- mx.symbol.Pooling(data = tanh1, pool_type = "max", kernel = c(2, 2), stride = c(2, 2)) - - # second conv - conv2 <- mx.symbol.Convolution(data = pool1, kernel = c(5, 5), num_filter = 50) - tanh2 <- mx.symbol.Activation(data = conv2, act_type = "tanh") - pool2 <- mx.symbol.Pooling(data = tanh2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2)) - # first fullc - flatten <- mx.symbol.Flatten(data = pool2) - fc1 <- mx.symbol.FullyConnected(data = flatten, num_hidden = 500) - tanh3 <- mx.symbol.Activation(data = fc1, act_type = "tanh") - # second fullc - fc2 <- mx.symbol.FullyConnected(data = tanh3, num_hidden = num_classes) - # loss - lenet <- mx.symbol.SoftmaxOutput(data = fc2, name = 'softmax') - return(lenet) -} diff --git a/example/image-classification/symbol_mlp.R b/example/image-classification/symbol_mlp.R deleted file mode 100644 index 87ec60d958b4..000000000000 --- a/example/image-classification/symbol_mlp.R +++ /dev/null @@ -1,29 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -library(mxnet) - -get_symbol <- function(num_classes = 1000) { - data <- mx.symbol.Variable('data') - fc1 <- mx.symbol.FullyConnected(data = data, name = 'fc1', num_hidden = 128) - act1 <- mx.symbol.Activation(data = fc1, name = 'relu1', act_type = "relu") - fc2 <- mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) - act2 <- mx.symbol.Activation(data = fc2, name = 'relu2', act_type = "relu") - fc3 <- mx.symbol.FullyConnected(data = act2, name = 'fc3', num_hidden = num_classes) - mlp <- mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') - return(mlp) -} diff --git a/example/image-classification/symbol_resnet-28-small.R b/example/image-classification/symbol_resnet-28-small.R deleted file mode 100644 index e08a6af78ea4..000000000000 --- a/example/image-classification/symbol_resnet-28-small.R +++ /dev/null @@ -1,99 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -library(mxnet) - -conv_factory <- function(data, num_filter, kernel, stride, - pad, act_type = 'relu', conv_type = 0) { - if (conv_type == 0) { - conv = mx.symbol.Convolution(data = data, num_filter = num_filter, - kernel = kernel, stride = stride, pad = pad) - bn = mx.symbol.BatchNorm(data = conv) - act = mx.symbol.Activation(data = bn, act_type = act_type) - return(act) - } else if (conv_type == 1) { - conv = mx.symbol.Convolution(data = data, num_filter = num_filter, - kernel = kernel, stride = stride, pad = pad) - bn = mx.symbol.BatchNorm(data = conv) - return(bn) - } -} - -residual_factory <- function(data, num_filter, dim_match) { - if (dim_match) { - identity_data = data - conv1 = conv_factory(data = data, num_filter = num_filter, kernel = c(3, 3), - stride = c(1, 1), pad = c(1, 1), act_type = 'relu', conv_type = 0) - - conv2 = conv_factory(data = conv1, num_filter = num_filter, kernel = c(3, 3), - stride = c(1, 1), pad = c(1, 1), conv_type = 1) - new_data = identity_data + conv2 - act = mx.symbol.Activation(data = new_data, act_type = 'relu') - return(act) - } else { - conv1 = conv_factory(data = data, num_filter = num_filter, kernel = c(3, 3), - stride = c(2, 2), pad = c(1, 1), act_type = 'relu', conv_type = 0) - conv2 = conv_factory(data = conv1, num_filter = num_filter, kernel = c(3, 3), - stride = c(1, 1), pad = c(1, 1), conv_type = 1) - - # adopt project method in the paper when dimension increased - project_data = conv_factory(data = data, num_filter = num_filter, kernel = c(1, 1), - stride = c(2, 2), pad = c(0, 0), conv_type = 1) - new_data = project_data + conv2 - act = mx.symbol.Activation(data = new_data, act_type = 'relu') - return(act) - } -} - -residual_net <- function(data, n) { - #fisrt 2n layers - for (i in 1:n) { - data = residual_factory(data = data, num_filter = 16, dim_match = TRUE) - } - - - #second 2n layers - for (i in 1:n) { - if (i == 1) { - data = residual_factory(data = data, num_filter = 32, dim_match = FALSE) - } else { - data = residual_factory(data = data, num_filter = 32, dim_match = TRUE) - } - } - #third 2n layers - for (i in 1:n) { - if (i == 1) { - data = residual_factory(data = data, num_filter = 64, dim_match = FALSE) - } else { - data = residual_factory(data = data, num_filter = 64, dim_match = TRUE) - } - } - return(data) -} - -get_symbol <- function(num_classes = 10) { - conv <- conv_factory(data = mx.symbol.Variable(name = 'data'), num_filter = 16, - kernel = c(3, 3), stride = c(1, 1), pad = c(1, 1), - act_type = 'relu', conv_type = 0) - n <- 3 # set n = 3 means get a model with 3*6+2=20 layers, set n = 9 means 9*6+2=56 layers - resnet <- residual_net(conv, n) # - pool <- mx.symbol.Pooling(data = resnet, kernel = c(7, 7), pool_type = 'avg') - flatten <- mx.symbol.Flatten(data = pool, name = 'flatten') - fc <- mx.symbol.FullyConnected(data = flatten, num_hidden = num_classes, name = 'fc1') - softmax <- mx.symbol.SoftmaxOutput(data = fc, name = 'softmax') - return(softmax) -} diff --git a/example/image-classification/symbol_resnet-v2.R b/example/image-classification/symbol_resnet-v2.R deleted file mode 100644 index 
4d4d874a0530..000000000000 --- a/example/image-classification/symbol_resnet-v2.R +++ /dev/null @@ -1,162 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -### -# Reproducing parper: -# Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks" -### - -library(mxnet) - -residual_unit <- function(data, num_filter, stride, dim_match, name, bottle_neck=TRUE, bn_mom=0.9, workspace=512){ - if(bottle_neck){ - bn1 <- mx.symbol.BatchNorm(data=data, fix_gamma=FALSE, eps=2e-5, - momentum=bn_mom, name=paste0(name,'_bn1')) - act1 <- mx.symbol.Activation(data=bn1, act_type='relu', - name=paste0(name, '_relu1')) - conv1 <- mx.symbol.Convolution(data=act1, num_filter=as.integer(num_filter*0.25), - kernel=c(1,1), stride=c(1,1), pad=c(0,0), - no_bias=TRUE, workspace=workspace, - name=paste0(name,'_conv1')) - bn2 <- mx.symbol.BatchNorm(data=conv1, fix_gamma=FALSE, eps=2e-5, - momentum=bn_mom, name=paste0(name, '_bn2')) - act2 <- mx.symbol.Activation(data=bn2, act_type='relu', name=paste0(name, '_relu2')) - conv2 <- mx.symbol.Convolution(data=act2, num_filter=as.integer(num_filter*0.25), - kernel=c(3,3), stride=stride, pad=c(1,1), - no_bias=TRUE, workspace=workspace, - name=paste0(name, '_conv2')) - bn3 <- mx.symbol.BatchNorm(data=conv2, fix_gamma=FALSE, eps=2e-5, - momentum=bn_mom, name=paste0(name, '_bn3')) - act3 <- mx.symbol.Activation(data=bn3, act_type='relu', name=paste0(name,'_relu3')) - conv3 <- mx.symbol.Convolution(data=act3, num_filter=num_filter, kernel=c(1,1), - stride=c(1,1), pad=c(0,0), no_bias=TRUE, - workspace=workspace, name=paste0(name, '_conv3')) - if (dim_match){ - shortcut <- data - } else{ - shortcut <- mx.symbol.Convolution(data=act1, num_filter=num_filter, - kernel=c(1,1), stride=stride, no_bias=TRUE, - workspace=workspace, name=paste0(name,'_sc')) - } - return (conv3 + shortcut) - } else{ - bn1 <- mx.symbol.BatchNorm(data=data, fix_gamma=FALSE, momentum=bn_mom, - eps=2e-5, name=paste0(name,'_bn1')) - act1 <- mx.symbol.Activation(data=bn1, act_type='relu', name=paste0(name, '_relu1')) - conv1 <- mx.symbol.Convolution(data=act1, num_filter=num_filter, kernel=c(3,3), - stride=stride, pad=c(1,1), no_bias=TRUE, - workspace=workspace, name=paste0(name,'_conv1')) - bn2 <- mx.symbol.BatchNorm(data=conv1, fix_gamma=FALSE, momentum=bn_mom, - eps=2e-5, name=paste0(name, '_bn2')) - act2 <- mx.symbol.Activation(data=bn2, act_type='relu', - name=paste0(name, '_relu2')) - conv2 <- mx.symbol.Convolution(data=act2, num_filter=num_filter, kernel=c(3,3), - stride=c(1,1), pad=c(1,1), no_bias=TRUE, - workspace=workspace, name=paste0(name, '_conv2')) - if (dim_match){ - shortcut = data - } else { - shortcut <- mx.symbol.Convolution(data=act1, num_filter=num_filter, kernel=c(1,1), - stride=stride, no_bias=TRUE, - 
workspace=workspace, name=paste0(name,'_sc')) - } - return (conv2 + shortcut) - } -} - - - -resnet <- function(units, num_stage, filter_list, num_class, bottle_neck=TRUE, - bn_mom=0.9, workspace=512){ - num_unit <- length(units) - if(num_unit != num_stage) stop("Number of units different from num_stage") - data <- mx.symbol.Variable(name='data') - data <- mx.symbol.BatchNorm(data=data, fix_gamma=TRUE, eps=2e-5, momentum=bn_mom, - name='bn_data') - body <- mx.symbol.Convolution(data=data, num_filter=filter_list[1], kernel=c(7, 7), - stride=c(2,2), pad=c(3, 3), - no_bias=TRUE, name="conv0", workspace=workspace) - body <- mx.symbol.BatchNorm(data=body, fix_gamma=FALSE, eps=2e-5, - momentum=bn_mom, name='bn0') - body <- mx.symbol.Activation(data=body, act_type='relu', name='relu0') - body <- mx.symbol.Pooling(data=body, kernel=c(3, 3), stride=c(2,2), - pad=c(1,1), pool_type='max') - - - for(i in 1:num_stage){ - if(i==1) stride <- c(1,1) - else stride <- c(2,2) - body <- residual_unit(body, filter_list[i+1], stride, FALSE, - name=paste0('stage', i, '_unit1') , - bottle_neck=bottle_neck, workspace=workspace) - for(j in 1:(units[i]-1)){ - body <- residual_unit(body, filter_list[i+1], c(1,1), - TRUE, name=paste0('stage',i, '_unit', j + 1), - bottle_neck=bottle_neck, - workspace=workspace) - } - } - bn1 <- mx.symbol.BatchNorm(data=body, fix_gamma=FALSE, eps=2e-5, - momentum=bn_mom, name='bn1') - relu1 <- mx.symbol.Activation(data=bn1, act_type='relu', name='relu1') - # Although kernel is not used here when global_pool=TRUE, we should put one - pool1 <- mx.symbol.Pooling(data=relu1, global_pool=TRUE, kernel=c(7, 7), - pool_type='avg', name='pool1') - flat <- mx.symbol.Flatten(data=pool1) - fc1 <- mx.symbol.FullyConnected(data=flat, num_hidden=num_class, name='fc1') - resnet <- mx.symbol.SoftmaxOutput(data=fc1, name='softmax') - return(resnet) -} - -get_symbol <- function(num_class, depth=18){ - if (depth == 18){ - units <- c(2, 2, 2, 2) - } else if (depth == 34){ - units = c(3, 4, 6, 3) - } else if (depth == 50){ - units = c(3, 4, 6, 3) - } else if (depth == 101){ - units = c(3, 4, 23, 3) - } else if (depth == 152){ - units = c(3, 8, 36, 3) - } else if (depth == 200){ - units = c(3, 24, 36, 3) - } else if (depth == 269){ - units = c(3, 30, 48, 8) - } else{ - stop(paste0("no experiments done on depth ", depth)) - } - - if (depth >=50){ - filter_list <- c(64, 256, 512, 1024, 2048) - bottle_neck <- TRUE - } else{ - filter_list <- c(64, 64, 128, 256, 512) - bottle_neck <- FALSE - } - bn_mom <- 0.9 #momentum of batch normalization - workspace <- 500 - symbol <- resnet(units=units, num_stage=4, filter_list=filter_list, - num_class=num_class, bottle_neck=bottle_neck, - bn_mom=bn_mom, workspace=workspace) - return(symbol) -} - - - - - diff --git a/example/image-classification/symbol_resnet.R b/example/image-classification/symbol_resnet.R deleted file mode 100644 index 0603f2c8ec4b..000000000000 --- a/example/image-classification/symbol_resnet.R +++ /dev/null @@ -1,87 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -library(mxnet) - -get_conv <- function(name, data, num_filter, kernel, stride, - pad, with_relu, bn_momentum) { - conv = mx.symbol.Convolution(name = name, data = data, num_filter = num_filter, - kernel = kernel, stride = stride, pad = pad, no_bias = TRUE) - bn = mx.symbol.BatchNorm(name = paste(name, '_bn', sep = ''), data = conv, - fix_gamma = FALSE, momentum = bn_momentum, eps = 2e-5) - if (with_relu) { - return(mx.symbol.Activation(name = paste(name, '_relu', sep = ''), - data = bn, act_type = 'relu')) - } else { - return(bn) - } -} - -make_block <- function(name, data, num_filter, dim_match, bn_momentum) { - if (dim_match) { - conv1 = get_conv(name = paste(name, '_conv1', sep = ''), data = data, - num_filter = num_filter, kernel = c(3, 3), stride = c(1, 1), - pad = c(1, 1), with_relu = TRUE, bn_momentum = bn_momentum) - } else { - conv1 = get_conv(name = paste(name, '_conv1', sep = ''), data = data, - num_filter = num_filter, kernel = c(3, 3), stride = c(2, 2), - pad = c(1, 1), with_relu = TRUE, bn_momentum = bn_momentum) - } - - conv2 = get_conv(name = paste(name, '_conv2', sep = ''), data = conv1, - num_filter = num_filter, kernel = c(3, 3), stride = c(1, 1), - pad = c(1, 1), with_relu = FALSE, bn_momentum = bn_momentum) - if (dim_match) { - shortcut = data - } else { - shortcut = mx.symbol.Convolution(name = paste(name, '_proj', sep = ''), - data = data, num_filter = num_filter, kernel = c(2, 2), - stride = c(2, 2), pad = c(0, 0), no_bias = TRUE) - } - fused = shortcut + conv2 - return(mx.symbol.Activation(name = paste(name, '_relu', sep = ''), data = fused, act_type = 'relu')) -} - -get_body <- function(data, num_level, num_block, num_filter, bn_momentum) { - for (level in 1:num_level) { - for (block in 1:num_block) { - data = make_block( - name = paste('level', level, '_block', block, sep = ''), - data = data, - num_filter = num_filter * 2 ^ (level - 1), - dim_match = (level == 1 || block > 1), - bn_momentum = bn_momentum - ) - } - } - return(data) -} - -get_symbol <- function(num_class, num_level = 3, num_block = 9, - num_filter = 16, bn_momentum = 0.9, pool_kernel = c(8, 8)) { - data = mx.symbol.Variable(name = 'data') - zscore = mx.symbol.BatchNorm(name = 'zscore', data = data, - fix_gamma = TRUE, momentum = bn_momentum) - conv = get_conv(name = 'conv0', data = zscore, num_filter = num_filter, - kernel = c(3, 3), stride = c(1, 1), pad = c(1, 1), - with_relu = TRUE, bn_momentum = bn_momentum) - body = get_body(conv, num_level, num_block, num_filter, bn_momentum) - pool = mx.symbol.Pooling(data = body, kernel = pool_kernel, pool_type = 'avg') - flat = mx.symbol.Flatten(data = pool) - fc = mx.symbol.FullyConnected(data = flat, num_hidden = num_class, name = 'fc') - return(mx.symbol.SoftmaxOutput(data = fc, name = 'softmax')) -} diff --git a/example/image-classification/symbol_vgg.R b/example/image-classification/symbol_vgg.R deleted file mode 100644 index 04b600751f31..000000000000 --- a/example/image-classification/symbol_vgg.R +++ /dev/null @@ -1,75 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -library(mxnet) - -get_symbol <- function(num_classes = 1000) { - ## define alexnet - data = mx.symbol.Variable(name = "data") - # group 1 - conv1_1 = mx.symbol.Convolution(data = data, kernel = c(3, 3), pad = c(1, 1), - num_filter = 64, name = "conv1_1") - relu1_1 = mx.symbol.Activation(data = conv1_1, act_type = "relu", name = "relu1_1") - pool1 = mx.symbol.Pooling(data = relu1_1, pool_type = "max", kernel = c(2, 2), - stride = c(2, 2), name = "pool1") - # group 2 - conv2_1 = mx.symbol.Convolution(data = pool1, kernel = c(3, 3), pad = c(1, 1), - num_filter = 128, name = "conv2_1") - relu2_1 = mx.symbol.Activation(data = conv2_1, act_type = "relu", name = "relu2_1") - pool2 = mx.symbol.Pooling(data = relu2_1, pool_type = "max", kernel = c(2, 2), - stride = c(2, 2), name = "pool2") - # group 3 - conv3_1 = mx.symbol.Convolution(data = pool2, kernel = c(3, 3), pad = c(1, 1), - num_filter = 256, name = "conv3_1") - relu3_1 = mx.symbol.Activation(data = conv3_1, act_type = "relu", name = "relu3_1") - conv3_2 = mx.symbol.Convolution(data = relu3_1, kernel = c(3, 3), pad = c(1, 1), - num_filter = 256, name = "conv3_2") - relu3_2 = mx.symbol.Activation(data = conv3_2, act_type = "relu", name = "relu3_2") - pool3 = mx.symbol.Pooling(data = relu3_2, pool_type = "max", kernel = c(2, 2), - stride = c(2, 2), name = "pool3") - # group 4 - conv4_1 = mx.symbol.Convolution(data = pool3, kernel = c(3, 3), pad = c(1, 1), - num_filter = 512, name = "conv4_1") - relu4_1 = mx.symbol.Activation(data = conv4_1, act_type = "relu", name = "relu4_1") - conv4_2 = mx.symbol.Convolution(data = relu4_1, kernel = c(3, 3), pad = c(1, 1), - num_filter = 512, name = "conv4_2") - relu4_2 = mx.symbol.Activation(data = conv4_2, act_type = "relu", name = "relu4_2") - pool4 = mx.symbol.Pooling(data = relu4_2, pool_type = "max", - kernel = c(2, 2), stride = c(2, 2), name = "pool4") - # group 5 - conv5_1 = mx.symbol.Convolution(data = pool4, kernel = c(3, 3), - pad = c(1, 1), num_filter = 512, name = "conv5_1") - relu5_1 = mx.symbol.Activation(data = conv5_1, act_type = "relu", name = "relu5_1") - conv5_2 = mx.symbol.Convolution(data = relu5_1, kernel = c(3, 3), - pad = c(1, 1), num_filter = 512, name = "conv5_2") - relu5_2 = mx.symbol.Activation(data = conv5_2, act_type = "relu", name = "relu5_2") - pool5 = mx.symbol.Pooling(data = relu5_2, pool_type = "max", - kernel = c(2, 2), stride = c(2, 2), name = "pool5") - # group 6 - flatten = mx.symbol.Flatten(data = pool5, name = "flatten") - fc6 = mx.symbol.FullyConnected(data = flatten, num_hidden = 4096, name = "fc6") - relu6 = mx.symbol.Activation(data = fc6, act_type = "relu", name = "relu6") - drop6 = mx.symbol.Dropout(data = relu6, p = 0.5, name = "drop6") - # group 7 - fc7 = mx.symbol.FullyConnected(data = drop6, num_hidden = 4096, name = "fc7") - relu7 = mx.symbol.Activation(data = fc7, act_type = 
"relu", name = "relu7") - drop7 = mx.symbol.Dropout(data = relu7, p = 0.5, name = "drop7") - # output - fc8 = mx.symbol.FullyConnected(data = drop7, num_hidden = num_classes, name = "fc8") - softmax = mx.symbol.SoftmaxOutput(data = fc8, name = 'softmax') - return(softmax) -} diff --git a/example/image-classification/symbols/README.md b/example/image-classification/symbols/README.md deleted file mode 100644 index b43b904ab051..000000000000 --- a/example/image-classification/symbols/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# Symbol - -This fold contains definition of various networks. To add a new network, please -use the following format. - -## Python - -- A file implements one network proposed in a paper, with the network name as the -filename. -- Mention the paper and the modifications made if any at the beginning -of the file. -- Indicate how to reproduce the accuracy numbers in the paper if it is not straightforward -- Provide a function `get_symbol()` that return the network diff --git a/example/image-classification/symbols/__init__.py b/example/image-classification/symbols/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/image-classification/symbols/alexnet.py b/example/image-classification/symbols/alexnet.py deleted file mode 100644 index f945b9f87cd9..000000000000 --- a/example/image-classification/symbols/alexnet.py +++ /dev/null @@ -1,68 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Reference: - -Krizhevsky, Alex, Ilya Sutskever, and Geoffrey E. Hinton. "Imagenet classification with deep convolutional neural networks." Advances in neural information processing systems. 2012. 
-""" -import mxnet as mx -import numpy as np - -def get_symbol(num_classes, dtype='float32', **kwargs): - input_data = mx.sym.Variable(name="data") - if dtype == 'float16': - input_data = mx.sym.Cast(data=input_data, dtype=np.float16) - # stage 1 - conv1 = mx.sym.Convolution(name='conv1', - data=input_data, kernel=(11, 11), stride=(4, 4), num_filter=96) - relu1 = mx.sym.Activation(data=conv1, act_type="relu") - lrn1 = mx.sym.LRN(data=relu1, alpha=0.0001, beta=0.75, knorm=2, nsize=5) - pool1 = mx.sym.Pooling( - data=lrn1, pool_type="max", kernel=(3, 3), stride=(2,2)) - # stage 2 - conv2 = mx.sym.Convolution(name='conv2', - data=pool1, kernel=(5, 5), pad=(2, 2), num_filter=256) - relu2 = mx.sym.Activation(data=conv2, act_type="relu") - lrn2 = mx.sym.LRN(data=relu2, alpha=0.0001, beta=0.75, knorm=2, nsize=5) - pool2 = mx.sym.Pooling(data=lrn2, kernel=(3, 3), stride=(2, 2), pool_type="max") - # stage 3 - conv3 = mx.sym.Convolution(name='conv3', - data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=384) - relu3 = mx.sym.Activation(data=conv3, act_type="relu") - conv4 = mx.sym.Convolution(name='conv4', - data=relu3, kernel=(3, 3), pad=(1, 1), num_filter=384) - relu4 = mx.sym.Activation(data=conv4, act_type="relu") - conv5 = mx.sym.Convolution(name='conv5', - data=relu4, kernel=(3, 3), pad=(1, 1), num_filter=256) - relu5 = mx.sym.Activation(data=conv5, act_type="relu") - pool3 = mx.sym.Pooling(data=relu5, kernel=(3, 3), stride=(2, 2), pool_type="max") - # stage 4 - flatten = mx.sym.Flatten(data=pool3) - fc1 = mx.sym.FullyConnected(name='fc1', data=flatten, num_hidden=4096) - relu6 = mx.sym.Activation(data=fc1, act_type="relu") - dropout1 = mx.sym.Dropout(data=relu6, p=0.5) - # stage 5 - fc2 = mx.sym.FullyConnected(name='fc2', data=dropout1, num_hidden=4096) - relu7 = mx.sym.Activation(data=fc2, act_type="relu") - dropout2 = mx.sym.Dropout(data=relu7, p=0.5) - # stage 6 - fc3 = mx.sym.FullyConnected(name='fc3', data=dropout2, num_hidden=num_classes) - if dtype == 'float16': - fc3 = mx.sym.Cast(data=fc3, dtype=np.float32) - softmax = mx.sym.SoftmaxOutput(data=fc3, name='softmax') - return softmax diff --git a/example/image-classification/symbols/googlenet.py b/example/image-classification/symbols/googlenet.py deleted file mode 100644 index 6745ba7b0068..000000000000 --- a/example/image-classification/symbols/googlenet.py +++ /dev/null @@ -1,72 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""References: - -Szegedy, Christian, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, Dragomir -Anguelov, Dumitru Erhan, Vincent Vanhoucke, and Andrew Rabinovich. "Going deeper -with convolutions." arXiv preprint arXiv:1409.4842 (2014). 
- -""" - -import mxnet as mx - -def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), name=None, suffix=''): - conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix)) - act = mx.symbol.Activation(data=conv, act_type='relu', name='relu_%s%s' %(name, suffix)) - return act - -def InceptionFactory(data, num_1x1, num_3x3red, num_3x3, num_d5x5red, num_d5x5, pool, proj, name): - # 1x1 - c1x1 = ConvFactory(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_1x1' % name)) - # 3x3 reduce + 3x3 - c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') - c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_3x3' % name)) - # double 3x3 reduce + double 3x3 - cd5x5r = ConvFactory(data=data, num_filter=num_d5x5red, kernel=(1, 1), name=('%s_5x5' % name), suffix='_reduce') - cd5x5 = ConvFactory(data=cd5x5r, num_filter=num_d5x5, kernel=(5, 5), pad=(2, 2), name=('%s_5x5' % name)) - # pool + proj - pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - cproj = ConvFactory(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_proj' % name)) - # concat - concat = mx.symbol.Concat(*[c1x1, c3x3, cd5x5, cproj], name='ch_concat_%s_chconcat' % name) - return concat - -def get_symbol(num_classes = 1000, **kwargs): - data = mx.sym.Variable("data") - conv1 = ConvFactory(data, 64, kernel=(7, 7), stride=(2,2), pad=(3, 3), name="conv1") - pool1 = mx.sym.Pooling(conv1, kernel=(3, 3), stride=(2, 2), pool_type="max") - conv2 = ConvFactory(pool1, 64, kernel=(1, 1), stride=(1,1), name="conv2") - conv3 = ConvFactory(conv2, 192, kernel=(3, 3), stride=(1, 1), pad=(1,1), name="conv3") - pool3 = mx.sym.Pooling(conv3, kernel=(3, 3), stride=(2, 2), pool_type="max") - - in3a = InceptionFactory(pool3, 64, 96, 128, 16, 32, "max", 32, name="in3a") - in3b = InceptionFactory(in3a, 128, 128, 192, 32, 96, "max", 64, name="in3b") - pool4 = mx.sym.Pooling(in3b, kernel=(3, 3), stride=(2, 2), pool_type="max") - in4a = InceptionFactory(pool4, 192, 96, 208, 16, 48, "max", 64, name="in4a") - in4b = InceptionFactory(in4a, 160, 112, 224, 24, 64, "max", 64, name="in4b") - in4c = InceptionFactory(in4b, 128, 128, 256, 24, 64, "max", 64, name="in4c") - in4d = InceptionFactory(in4c, 112, 144, 288, 32, 64, "max", 64, name="in4d") - in4e = InceptionFactory(in4d, 256, 160, 320, 32, 128, "max", 128, name="in4e") - pool5 = mx.sym.Pooling(in4e, kernel=(3, 3), stride=(2, 2), pool_type="max") - in5a = InceptionFactory(pool5, 256, 160, 320, 32, 128, "max", 128, name="in5a") - in5b = InceptionFactory(in5a, 384, 192, 384, 48, 128, "max", 128, name="in5b") - pool6 = mx.sym.Pooling(in5b, kernel=(7, 7), stride=(1,1), global_pool=True, pool_type="avg") - flatten = mx.sym.Flatten(data=pool6) - fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes) - softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax') - return softmax diff --git a/example/image-classification/symbols/inception-bn.py b/example/image-classification/symbols/inception-bn.py deleted file mode 100644 index 84934a5f72aa..000000000000 --- a/example/image-classification/symbols/inception-bn.py +++ /dev/null @@ -1,144 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" - -Inception + BN, suitable for images with around 224 x 224 - -Reference: - -Sergey Ioffe and Christian Szegedy. Batch normalization: Accelerating deep -network training by reducing internal covariate shift. arXiv preprint -arXiv:1502.03167, 2015. - -""" -import mxnet as mx - -eps = 1e-10 + 1e-5 -bn_mom = 0.9 -fix_gamma = False - - -def ConvFactory(data, num_filter, kernel, stride=(1,1), pad=(0, 0), name=None, suffix='', attr={}): - conv = mx.symbol.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, name='conv_%s%s' %(name, suffix)) - bn = mx.symbol.BatchNorm(data=conv, fix_gamma=fix_gamma, eps=eps, momentum=bn_mom, name='bn_%s%s' %(name, suffix)) - act = mx.symbol.Activation(data=bn, act_type='relu', name='relu_%s%s' %(name, suffix), attr=attr) - return act - -def InceptionFactoryA(data, num_1x1, num_3x3red, num_3x3, num_d3x3red, num_d3x3, pool, proj, name): - # 1x1 - c1x1 = ConvFactory(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_1x1' % name)) - # 3x3 reduce + 3x3 - c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') - c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_3x3' % name)) - # double 3x3 reduce + double 3x3 - cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') - cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_0' % name)) - cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), name=('%s_double_3x3_1' % name)) - # pool + proj - pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - cproj = ConvFactory(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_proj' % name)) - # concat - concat = mx.symbol.Concat(*[c1x1, c3x3, cd3x3, cproj], name='ch_concat_%s_chconcat' % name) - return concat - -def InceptionFactoryB(data, num_3x3red, num_3x3, num_d3x3red, num_d3x3, name): - # 3x3 reduce + 3x3 - c3x3r = ConvFactory(data=data, num_filter=num_3x3red, kernel=(1, 1), name=('%s_3x3' % name), suffix='_reduce') - c3x3 = ConvFactory(data=c3x3r, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_3x3' % name)) - # double 3x3 reduce + double 3x3 - cd3x3r = ConvFactory(data=data, num_filter=num_d3x3red, kernel=(1, 1), name=('%s_double_3x3' % name), suffix='_reduce') - cd3x3 = ConvFactory(data=cd3x3r, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_double_3x3_0' % name)) - cd3x3 = ConvFactory(data=cd3x3, num_filter=num_d3x3, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name=('%s_double_3x3_1' % name)) - # pool + proj - pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(1, 1), 
pool_type="max", name=('max_pool_%s_pool' % name)) - # concat - concat = mx.symbol.Concat(*[c3x3, cd3x3, pooling], name='ch_concat_%s_chconcat' % name) - return concat - -# A Simple Downsampling Factory -def DownsampleFactory(data, ch_3x3, name, attr): - # conv 3x3 - conv = ConvFactory(data=data, name=name+'_conv',kernel=(3, 3), stride=(2, 2), num_filter=ch_3x3, pad=(1, 1), attr=attr) - # pool - pool = mx.symbol.Pooling(data=data, name=name+'_pool',kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max', attr=attr) - # concat - concat = mx.symbol.Concat(*[conv, pool], name=name+'_ch_concat') - return concat - -# A Simple module -def SimpleFactory(data, ch_1x1, ch_3x3, name, attr): - # 1x1 - conv1x1 = ConvFactory(data=data, name=name+'_1x1', kernel=(1, 1), pad=(0, 0), num_filter=ch_1x1, attr=attr) - # 3x3 - conv3x3 = ConvFactory(data=data, name=name+'_3x3', kernel=(3, 3), pad=(1, 1), num_filter=ch_3x3, attr=attr) - #concat - concat = mx.symbol.Concat(*[conv1x1, conv3x3], name=name+'_ch_concat') - return concat - - -def get_symbol(num_classes, image_shape, **kwargs): - image_shape = [int(l) for l in image_shape.split(',')] - (nchannel, height, width) = image_shape - # attr = {'force_mirroring': 'true'} - attr = {} - - # data - data = mx.symbol.Variable(name="data") - if height <= 28: - # a simper version - conv1 = ConvFactory(data=data, kernel=(3,3), pad=(1,1), name="1", num_filter=96, attr=attr) - in3a = SimpleFactory(conv1, 32, 32, 'in3a', attr) - in3b = SimpleFactory(in3a, 32, 48, 'in3b', attr) - in3c = DownsampleFactory(in3b, 80, 'in3c', attr) - in4a = SimpleFactory(in3c, 112, 48, 'in4a', attr) - in4b = SimpleFactory(in4a, 96, 64, 'in4b', attr) - in4c = SimpleFactory(in4b, 80, 80, 'in4c', attr) - in4d = SimpleFactory(in4c, 48, 96, 'in4d', attr) - in4e = DownsampleFactory(in4d, 96, 'in4e', attr) - in5a = SimpleFactory(in4e, 176, 160, 'in5a', attr) - in5b = SimpleFactory(in5a, 176, 160, 'in5b', attr) - pool = mx.symbol.Pooling(data=in5b, pool_type="avg", kernel=(7,7), name="global_pool", attr=attr) - else: - # stage 1 - conv1 = ConvFactory(data=data, num_filter=64, kernel=(7, 7), stride=(2, 2), pad=(3, 3), name='1') - pool1 = mx.symbol.Pooling(data=conv1, kernel=(3, 3), stride=(2, 2), name='pool_1', pool_type='max') - # stage 2 - conv2red = ConvFactory(data=pool1, num_filter=64, kernel=(1, 1), stride=(1, 1), name='2_red') - conv2 = ConvFactory(data=conv2red, num_filter=192, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name='2') - pool2 = mx.symbol.Pooling(data=conv2, kernel=(3, 3), stride=(2, 2), name='pool_2', pool_type='max') - # stage 2 - in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, "avg", 32, '3a') - in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, "avg", 64, '3b') - in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, '3c') - # stage 3 - in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, '4a') - in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, '4b') - in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, '4c') - in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, "avg", 128, '4d') - in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, '4e') - # stage 4 - in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, '5a') - in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, '5b') - # global avg pooling - pool = mx.symbol.Pooling(data=in5b, kernel=(7, 7), stride=(1, 1), name="global_pool", pool_type='avg') - - # linear classifier - flatten = mx.symbol.Flatten(data=pool) - fc1 = 
mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes) - softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax') - return softmax diff --git a/example/image-classification/symbols/inception-resnet-v2.py b/example/image-classification/symbols/inception-resnet-v2.py deleted file mode 100644 index 866d8106ba9a..000000000000 --- a/example/image-classification/symbols/inception-resnet-v2.py +++ /dev/null @@ -1,158 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Contains the definition of the Inception Resnet V2 architecture. -As described in http://arxiv.org/abs/1602.07261. -Inception-v4, Inception-ResNet and the Impact of Residual Connections -on Learning -Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi -""" -import mxnet as mx - - -def ConvFactory(data, num_filter, kernel, stride=(1, 1), pad=(0, 0), act_type="relu", mirror_attr={}, with_act=True): - conv = mx.symbol.Convolution( - data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad) - bn = mx.symbol.BatchNorm(data=conv) - if with_act: - act = mx.symbol.Activation( - data=bn, act_type=act_type, attr=mirror_attr) - return act - else: - return bn - - -def block35(net, input_num_channels, scale=1.0, with_act=True, act_type='relu', mirror_attr={}): - tower_conv = ConvFactory(net, 32, (1, 1)) - tower_conv1_0 = ConvFactory(net, 32, (1, 1)) - tower_conv1_1 = ConvFactory(tower_conv1_0, 32, (3, 3), pad=(1, 1)) - tower_conv2_0 = ConvFactory(net, 32, (1, 1)) - tower_conv2_1 = ConvFactory(tower_conv2_0, 48, (3, 3), pad=(1, 1)) - tower_conv2_2 = ConvFactory(tower_conv2_1, 64, (3, 3), pad=(1, 1)) - tower_mixed = mx.symbol.Concat(*[tower_conv, tower_conv1_1, tower_conv2_2]) - tower_out = ConvFactory( - tower_mixed, input_num_channels, (1, 1), with_act=False) - - net = net + scale * tower_out - if with_act: - act = mx.symbol.Activation( - data=net, act_type=act_type, attr=mirror_attr) - return act - else: - return net - - -def block17(net, input_num_channels, scale=1.0, with_act=True, act_type='relu', mirror_attr={}): - tower_conv = ConvFactory(net, 192, (1, 1)) - tower_conv1_0 = ConvFactory(net, 129, (1, 1)) - tower_conv1_1 = ConvFactory(tower_conv1_0, 160, (1, 7), pad=(1, 2)) - tower_conv1_2 = ConvFactory(tower_conv1_1, 192, (7, 1), pad=(2, 1)) - tower_mixed = mx.symbol.Concat(*[tower_conv, tower_conv1_2]) - tower_out = ConvFactory( - tower_mixed, input_num_channels, (1, 1), with_act=False) - net = net + scale * tower_out - if with_act: - act = mx.symbol.Activation( - data=net, act_type=act_type, attr=mirror_attr) - return act - else: - return net - - -def block8(net, input_num_channels, scale=1.0, with_act=True, act_type='relu', mirror_attr={}): - tower_conv = ConvFactory(net, 192, (1, 1)) - tower_conv1_0 = ConvFactory(net, 192, (1, 1)) - tower_conv1_1 = 
ConvFactory(tower_conv1_0, 224, (1, 3), pad=(0, 1)) - tower_conv1_2 = ConvFactory(tower_conv1_1, 256, (3, 1), pad=(1, 0)) - tower_mixed = mx.symbol.Concat(*[tower_conv, tower_conv1_2]) - tower_out = ConvFactory( - tower_mixed, input_num_channels, (1, 1), with_act=False) - net = net + scale * tower_out - if with_act: - act = mx.symbol.Activation( - data=net, act_type=act_type, attr=mirror_attr) - return act - else: - return net - - -def repeat(inputs, repetitions, layer, *args, **kwargs): - outputs = inputs - for i in range(repetitions): - outputs = layer(outputs, *args, **kwargs) - return outputs - - -def get_symbol(num_classes=1000, **kwargs): - data = mx.symbol.Variable(name='data') - conv1a_3_3 = ConvFactory(data=data, num_filter=32, - kernel=(3, 3), stride=(2, 2)) - conv2a_3_3 = ConvFactory(conv1a_3_3, 32, (3, 3)) - conv2b_3_3 = ConvFactory(conv2a_3_3, 64, (3, 3), pad=(1, 1)) - maxpool3a_3_3 = mx.symbol.Pooling( - data=conv2b_3_3, kernel=(3, 3), stride=(2, 2), pool_type='max') - conv3b_1_1 = ConvFactory(maxpool3a_3_3, 80, (1, 1)) - conv4a_3_3 = ConvFactory(conv3b_1_1, 192, (3, 3)) - maxpool5a_3_3 = mx.symbol.Pooling( - data=conv4a_3_3, kernel=(3, 3), stride=(2, 2), pool_type='max') - - tower_conv = ConvFactory(maxpool5a_3_3, 96, (1, 1)) - tower_conv1_0 = ConvFactory(maxpool5a_3_3, 48, (1, 1)) - tower_conv1_1 = ConvFactory(tower_conv1_0, 64, (5, 5), pad=(2, 2)) - - tower_conv2_0 = ConvFactory(maxpool5a_3_3, 64, (1, 1)) - tower_conv2_1 = ConvFactory(tower_conv2_0, 96, (3, 3), pad=(1, 1)) - tower_conv2_2 = ConvFactory(tower_conv2_1, 96, (3, 3), pad=(1, 1)) - - tower_pool3_0 = mx.symbol.Pooling(data=maxpool5a_3_3, kernel=( - 3, 3), stride=(1, 1), pad=(1, 1), pool_type='avg') - tower_conv3_1 = ConvFactory(tower_pool3_0, 64, (1, 1)) - tower_5b_out = mx.symbol.Concat( - *[tower_conv, tower_conv1_1, tower_conv2_2, tower_conv3_1]) - net = repeat(tower_5b_out, 10, block35, scale=0.17, input_num_channels=320) - tower_conv = ConvFactory(net, 384, (3, 3), stride=(2, 2)) - tower_conv1_0 = ConvFactory(net, 256, (1, 1)) - tower_conv1_1 = ConvFactory(tower_conv1_0, 256, (3, 3), pad=(1, 1)) - tower_conv1_2 = ConvFactory(tower_conv1_1, 384, (3, 3), stride=(2, 2)) - tower_pool = mx.symbol.Pooling(net, kernel=( - 3, 3), stride=(2, 2), pool_type='max') - net = mx.symbol.Concat(*[tower_conv, tower_conv1_2, tower_pool]) - net = repeat(net, 20, block17, scale=0.1, input_num_channels=1088) - tower_conv = ConvFactory(net, 256, (1, 1)) - tower_conv0_1 = ConvFactory(tower_conv, 384, (3, 3), stride=(2, 2)) - tower_conv1 = ConvFactory(net, 256, (1, 1)) - tower_conv1_1 = ConvFactory(tower_conv1, 288, (3, 3), stride=(2, 2)) - tower_conv2 = ConvFactory(net, 256, (1, 1)) - tower_conv2_1 = ConvFactory(tower_conv2, 288, (3, 3), pad=(1, 1)) - tower_conv2_2 = ConvFactory(tower_conv2_1, 320, (3, 3), stride=(2, 2)) - tower_pool = mx.symbol.Pooling(net, kernel=( - 3, 3), stride=(2, 2), pool_type='max') - net = mx.symbol.Concat( - *[tower_conv0_1, tower_conv1_1, tower_conv2_2, tower_pool]) - - net = repeat(net, 9, block8, scale=0.2, input_num_channels=2080) - net = block8(net, with_act=False, input_num_channels=2080) - - net = ConvFactory(net, 1536, (1, 1)) - net = mx.symbol.Pooling(net, kernel=( - 1, 1), global_pool=True, stride=(2, 2), pool_type='avg') - net = mx.symbol.Flatten(net) - net = mx.symbol.Dropout(data=net, p=0.2) - net = mx.symbol.FullyConnected(data=net, num_hidden=num_classes) - softmax = mx.symbol.SoftmaxOutput(data=net, name='softmax') - return softmax diff --git 
a/example/image-classification/symbols/inception-v3.py b/example/image-classification/symbols/inception-v3.py deleted file mode 100644 index 5108579ffd3a..000000000000 --- a/example/image-classification/symbols/inception-v3.py +++ /dev/null @@ -1,193 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Inception V3, suitable for images with around 299 x 299 - -Reference: - -Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015). -""" -import mxnet as mx -import numpy as np - -def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''): - conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix)) - bn = mx.sym.BatchNorm(data=conv, name='%s%s_batchnorm' %(name, suffix), fix_gamma=True) - act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix)) - return act - - -def Inception7A(data, - num_1x1, - num_3x3_red, num_3x3_1, num_3x3_2, - num_5x5_red, num_5x5, - pool, proj, - name): - tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name)) - tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv') - tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1') - tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv') - tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') - tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2') - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv') - concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name) - return concat - -# First Downsample -def Inception7B(data, - num_3x3, - num_d3x3_red, num_d3x3_1, num_d3x3_2, - pool, - name): - tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name)) - tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv') - tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_tower' % name), suffix='_conv_1') - tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2') - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name)) - concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name) - return concat 
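
Each Inception7* helper in this removed file follows the same branch-and-concatenate pattern: parallel towers built from 1x1 reductions, larger or factorized convolutions, and a pooling path, joined along the channel axis. A minimal self-contained sketch of that pattern with a shape check, assuming the legacy `mx.sym` API (the helper and variable names below are illustrative, not taken from the removed file):

```python
import mxnet as mx

def conv_bn_relu(data, num_filter, kernel, pad=(0, 0), stride=(1, 1), name=''):
    # Conv -> BatchNorm -> ReLU, the same building block the removed Conv() helper uses.
    conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel,
                              stride=stride, pad=pad, no_bias=True, name=name + '_conv')
    bn = mx.sym.BatchNorm(data=conv, fix_gamma=True, name=name + '_bn')
    return mx.sym.Activation(data=bn, act_type='relu', name=name + '_relu')

def toy_inception_block(data, name='blk'):
    # Two convolutional towers plus a pooling tower, concatenated on the channel axis.
    t1 = conv_bn_relu(data, 32, kernel=(1, 1), name=name + '_t1')
    t2 = conv_bn_relu(data, 16, kernel=(1, 1), name=name + '_t2_reduce')
    t2 = conv_bn_relu(t2, 32, kernel=(3, 3), pad=(1, 1), name=name + '_t2')
    pool = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1),
                          pool_type='avg', name=name + '_pool')
    t3 = conv_bn_relu(pool, 32, kernel=(1, 1), name=name + '_t3')
    return mx.sym.Concat(t1, t2, t3, name=name + '_concat')

if __name__ == '__main__':
    data = mx.sym.Variable('data')
    out = toy_inception_block(data)
    # Channels add up across towers (32 + 32 + 32 = 96); the spatial size is preserved.
    _, out_shapes, _ = out.infer_shape(data=(1, 3, 32, 32))
    print(out_shapes)  # expected: [(1, 96, 32, 32)]
```
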
- -def Inception7C(data, - num_1x1, - num_d7_red, num_d7_1, num_d7_2, - num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4, - pool, proj, - name): - tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) - tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv') - tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1') - tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2') - tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv') - tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1') - tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2') - tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3') - tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4') - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), suffix='_conv') - # concat - concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name) - return concat - -def Inception7D(data, - num_3x3_red, num_3x3, - num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3, - pool, - name): - tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv') - tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1') - tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv') - tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1') - tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2') - tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3') - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - # concat - concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name) - return concat - -def Inception7E(data, - num_1x1, - num_d3_red, num_d3_1, num_d3_2, - num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2, - pool, proj, - name): - tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) - tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv') - tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv') - tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1') - tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv') - tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') - tower_3x3_d3_a = Conv(data=tower_3x3_d3, 
num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv') - tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1') - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), suffix='_conv') - # concat - concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name) - return concat - -# In[49]: - -def get_symbol(num_classes=1000, dtype='float32', **kwargs): - data = mx.sym.Variable(name="data") - if dtype == 'float32': - data = mx.sym.identity(data=data, name='id') - else: - if dtype == 'float16': - data = mx.sym.Cast(data=data, dtype=np.float16) - # stage 1 - conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv") - conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1") - conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2") - pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool") - # stage 2 - conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3") - conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4") - pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1") - # stage 3 - in3a = Inception7A(pool1, 64, - 64, 96, 96, - 48, 64, - "avg", 32, "mixed") - in3b = Inception7A(in3a, 64, - 64, 96, 96, - 48, 64, - "avg", 64, "mixed_1") - in3c = Inception7A(in3b, 64, - 64, 96, 96, - 48, 64, - "avg", 64, "mixed_2") - in3d = Inception7B(in3c, 384, - 64, 96, 96, - "max", "mixed_3") - # stage 4 - in4a = Inception7C(in3d, 192, - 128, 128, 192, - 128, 128, 128, 128, 192, - "avg", 192, "mixed_4") - in4b = Inception7C(in4a, 192, - 160, 160, 192, - 160, 160, 160, 160, 192, - "avg", 192, "mixed_5") - in4c = Inception7C(in4b, 192, - 160, 160, 192, - 160, 160, 160, 160, 192, - "avg", 192, "mixed_6") - in4d = Inception7C(in4c, 192, - 192, 192, 192, - 192, 192, 192, 192, 192, - "avg", 192, "mixed_7") - in4e = Inception7D(in4d, 192, 320, - 192, 192, 192, 192, - "max", "mixed_8") - # stage 5 - in5a = Inception7E(in4e, 320, - 384, 384, 384, - 448, 384, 384, 384, - "avg", 192, "mixed_9") - in5b = Inception7E(in5a, 320, - 384, 384, 384, - 448, 384, 384, 384, - "max", 192, "mixed_10") - # pool - pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool") - flatten = mx.sym.Flatten(data=pool, name="flatten") - fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1') - if dtype == 'float16': - fc1 = mx.sym.Cast(data=fc1, dtype=np.float32) - softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax') - return softmax diff --git a/example/image-classification/symbols/inception-v4.py b/example/image-classification/symbols/inception-v4.py deleted file mode 100644 index 2b4fe6fbb0c7..000000000000 --- a/example/image-classification/symbols/inception-v4.py +++ /dev/null @@ -1,215 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# -*- coding:utf-8 -*- -__author__ = 'zhangshuai' -modified_date = '16/7/5' -__modify__ = 'anchengwu' -modified_date = '17/2/22' - -''' -Inception v4 , suittable for image with around 299 x 299 - -Reference: - Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning - Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke - arXiv.1602.07261 -''' -import mxnet as mx -import numpy as np - -def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''): - conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix)) - bn = mx.sym.BatchNorm(data=conv, name='%s%s_batchnorm' %(name, suffix), fix_gamma=True) - act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix)) - - return act - - -def Inception_stem(data, name= None): - c = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name='%s_conv1_3*3' %name) - c = Conv(c, 32, kernel=(3, 3), name='%s_conv2_3*3' %name) - c = Conv(c, 64, kernel=(3, 3), pad=(1, 1), name='%s_conv3_3*3' %name) - - p1 = mx.sym.Pooling(c, kernel=(3, 3), stride=(2, 2), pool_type='max', name='%s_maxpool_1' %name) - c2 = Conv(c, 96, kernel=(3, 3), stride=(2, 2), name='%s_conv4_3*3' %name) - concat = mx.sym.Concat(*[p1, c2], name='%s_concat_1' %name) - - c1 = Conv(concat, 64, kernel=(1, 1), pad=(0, 0), name='%s_conv5_1*1' %name) - c1 = Conv(c1, 96, kernel=(3, 3), name='%s_conv6_3*3' %name) - - c2 = Conv(concat, 64, kernel=(1, 1), pad=(0, 0), name='%s_conv7_1*1' %name) - c2 = Conv(c2, 64, kernel=(7, 1), pad=(3, 0), name='%s_conv8_7*1' %name) - c2 = Conv(c2, 64, kernel=(1, 7), pad=(0, 3), name='%s_conv9_1*7' %name) - c2 = Conv(c2, 96, kernel=(3, 3), pad=(0, 0), name='%s_conv10_3*3' %name) - - concat = mx.sym.Concat(*[c1, c2], name='%s_concat_2' %name) - - c1 = Conv(concat, 192, kernel=(3, 3), stride=(2, 2), name='%s_conv11_3*3' %name) - p1 = mx.sym.Pooling(concat, kernel=(3, 3), stride=(2, 2), pool_type='max', name='%s_maxpool_2' %name) - - concat = mx.sym.Concat(*[c1, p1], name='%s_concat_3' %name) - - return concat - - -def InceptionA(input, name=None): - p1 = mx.sym.Pooling(input, kernel=(3, 3), pad=(1, 1), pool_type='avg', name='%s_avgpool_1' %name) - c1 = Conv(p1, 96, kernel=(1, 1), pad=(0, 0), name='%s_conv1_1*1' %name) - - c2 = Conv(input, 96, kernel=(1, 1), pad=(0, 0), name='%s_conv2_1*1' %name) - - c3 = Conv(input, 64, kernel=(1, 1), pad=(0, 0), name='%s_conv3_1*1' %name) - c3 = Conv(c3, 96, kernel=(3, 3), pad=(1, 1), name='%s_conv4_3*3' %name) - - c4 = Conv(input, 64, kernel=(1, 1), pad=(0, 0), name='%s_conv5_1*1' % name) - c4 = Conv(c4, 96, kernel=(3, 3), pad=(1, 1), name='%s_conv6_3*3' % name) - c4 = Conv(c4, 96, kernel=(3, 3), pad=(1, 1), name='%s_conv7_3*3' %name) - - concat = mx.sym.Concat(*[c1, c2, c3, c4], name='%s_concat_1' %name) - - return concat - - -def ReductionA(input, name=None): - p1 = mx.sym.Pooling(input, kernel=(3, 3), stride=(2, 2), pool_type='max', name='%s_maxpool_1' %name) - - c2 = Conv(input, 384, kernel=(3, 3), stride=(2, 2), name='%s_conv1_3*3' %name) - - c3 = Conv(input, 192, 
kernel=(1, 1), pad=(0, 0), name='%s_conv2_1*1' %name) - c3 = Conv(c3, 224, kernel=(3, 3), pad=(1, 1), name='%s_conv3_3*3' %name) - c3 = Conv(c3, 256, kernel=(3, 3), stride=(2, 2), pad=(0, 0), name='%s_conv4_3*3' %name) - - concat = mx.sym.Concat(*[p1, c2, c3], name='%s_concat_1' %name) - - return concat - -def InceptionB(input, name=None): - p1 = mx.sym.Pooling(input, kernel=(3, 3), pad=(1, 1), pool_type='avg', name='%s_avgpool_1' %name) - c1 = Conv(p1, 128, kernel=(1, 1), pad=(0, 0), name='%s_conv1_1*1' %name) - - c2 = Conv(input, 384, kernel=(1, 1), pad=(0, 0), name='%s_conv2_1*1' %name) - - c3 = Conv(input, 192, kernel=(1, 1), pad=(0, 0), name='%s_conv3_1*1' %name) - c3 = Conv(c3, 224, kernel=(1, 7), pad=(0, 3), name='%s_conv4_1*7' %name) - #paper wrong - c3 = Conv(c3, 256, kernel=(7, 1), pad=(3, 0), name='%s_conv5_1*7' %name) - - c4 = Conv(input, 192, kernel=(1, 1), pad=(0, 0), name='%s_conv6_1*1' %name) - c4 = Conv(c4, 192, kernel=(1, 7), pad=(0, 3), name='%s_conv7_1*7' %name) - c4 = Conv(c4, 224, kernel=(7, 1), pad=(3, 0), name='%s_conv8_7*1' %name) - c4 = Conv(c4, 224, kernel=(1, 7), pad=(0, 3), name='%s_conv9_1*7' %name) - c4 = Conv(c4, 256, kernel=(7, 1), pad=(3, 0), name='%s_conv10_7*1' %name) - - concat = mx.sym.Concat(*[c1, c2, c3, c4], name='%s_concat_1' %name) - - return concat - -def ReductionB(input,name=None): - p1 = mx.sym.Pooling(input, kernel=(3, 3), stride=(2, 2), pool_type='max', name='%s_maxpool_1' %name) - - c2 = Conv(input, 192, kernel=(1, 1), pad=(0, 0), name='%s_conv1_1*1' %name) - c2 = Conv(c2, 192, kernel=(3, 3), stride=(2, 2), name='%s_conv2_3*3' %name) - - c3 = Conv(input, 256, kernel=(1, 1), pad=(0, 0), name='%s_conv3_1*1' %name) - c3 = Conv(c3, 256, kernel=(1, 7), pad=(0, 3), name='%s_conv4_1*7' %name) - c3 = Conv(c3, 320, kernel=(7, 1), pad=(3, 0), name='%s_conv5_7*1' %name) - c3 = Conv(c3, 320, kernel=(3, 3), stride=(2, 2), name='%s_conv6_3*3' %name) - - concat = mx.sym.Concat(*[p1, c2, c3], name='%s_concat_1' %name) - - return concat - - -def InceptionC(input, name=None): - p1 = mx.sym.Pooling(input, kernel=(3, 3), pad=(1, 1), pool_type='avg', name='%s_avgpool_1' %name) - c1 = Conv(p1, 256, kernel=(1, 1), pad=(0, 0), name='%s_conv1_1*1' %name) - - c2 = Conv(input, 256, kernel=(1, 1), pad=(0, 0), name='%s_conv2_1*1' %name) - - c3 = Conv(input, 384, kernel=(1, 1), pad=(0, 0), name='%s_conv3_1*1' %name) - c3_1 = Conv(c3, 256, kernel=(1, 3), pad=(0, 1), name='%s_conv4_3*1' %name) - c3_2 = Conv(c3, 256, kernel=(3, 1), pad=(1, 0), name='%s_conv5_1*3' %name) - - c4 = Conv(input, 384, kernel=(1, 1), pad=(0, 0), name='%s_conv6_1*1' %name) - c4 = Conv(c4, 448, kernel=(1, 3), pad=(0, 1), name='%s_conv7_1*3' %name) - c4 = Conv(c4, 512, kernel=(3, 1), pad=(1, 0), name='%s_conv8_3*1' %name) - c4_1 = Conv(c4, 256, kernel=(3, 1), pad=(1, 0), name='%s_conv9_1*3' %name) - c4_2 = Conv(c4, 256, kernel=(1, 3), pad=(0, 1), name='%s_conv10_3*1' %name) - - concat = mx.sym.Concat(*[c1, c2, c3_1, c3_2, c4_1, c4_2], name='%s_concat' %name) - - return concat - - -def get_symbol(num_classes=1000, dtype='float32', **kwargs): - data = mx.sym.Variable(name="data") - if dtype == 'float32': - data = mx.sym.identity(data=data, name='id') - else: - if dtype == 'float16': - data = mx.sym.Cast(data=data, dtype=np.float16) - x = Inception_stem(data, name='in_stem') - - #4 * InceptionA - # x = InceptionA(x, name='in1A') - # x = InceptionA(x, name='in2A') - # x = InceptionA(x, name='in3A') - # x = InceptionA(x, name='in4A') - - for i in range(4): - x = InceptionA(x, name='in%dA' %(i+1)) - - 
#Reduction A - x = ReductionA(x, name='re1A') - - #7 * InceptionB - # x = InceptionB(x, name='in1B') - # x = InceptionB(x, name='in2B') - # x = InceptionB(x, name='in3B') - # x = InceptionB(x, name='in4B') - # x = InceptionB(x, name='in5B') - # x = InceptionB(x, name='in6B') - # x = InceptionB(x, name='in7B') - - for i in range(7): - x = InceptionB(x, name='in%dB' %(i+1)) - - #ReductionB - x = ReductionB(x, name='re1B') - - #3 * InceptionC - # x = InceptionC(x, name='in1C') - # x = InceptionC(x, name='in2C') - # x = InceptionC(x, name='in3C') - - for i in range(3): - x = InceptionC(x, name='in%dC' %(i+1)) - - #Average Pooling - x = mx.sym.Pooling(x, kernel=(8, 8), pad=(1, 1), pool_type='avg', name='global_avgpool') - - #Dropout - x = mx.sym.Dropout(x, p=0.2) - - flatten = mx.sym.Flatten(x, name='flatten') - fc1 = mx.sym.FullyConnected(flatten, num_hidden=num_classes, name='fc1') - if dtype == 'float16': - fc1 = mx.sym.Cast(data=fc1, dtype=np.float32) - softmax = mx.sym.SoftmaxOutput(fc1, name='softmax') - - return softmax diff --git a/example/image-classification/symbols/lenet.py b/example/image-classification/symbols/lenet.py deleted file mode 100644 index f2cc106f60ac..000000000000 --- a/example/image-classification/symbols/lenet.py +++ /dev/null @@ -1,64 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner. -Gradient-based learning applied to document recognition. 
-Proceedings of the IEEE (1998) -""" -import mxnet as mx - -def get_loc(data, attr={'lr_mult':'0.01'}): - """ - the localisation network in lenet-stn, it will increase acc about more than 1%, - when num-epoch >=15 - """ - loc = mx.symbol.Convolution(data=data, num_filter=30, kernel=(5, 5), stride=(2,2)) - loc = mx.symbol.Activation(data = loc, act_type='relu') - loc = mx.symbol.Pooling(data=loc, kernel=(2, 2), stride=(2, 2), pool_type='max') - loc = mx.symbol.Convolution(data=loc, num_filter=60, kernel=(3, 3), stride=(1,1), pad=(1, 1)) - loc = mx.symbol.Activation(data = loc, act_type='relu') - loc = mx.symbol.Pooling(data=loc, global_pool=True, kernel=(2, 2), pool_type='avg') - loc = mx.symbol.Flatten(data=loc) - loc = mx.symbol.FullyConnected(data=loc, num_hidden=6, name="stn_loc", attr=attr) - return loc - - -def get_symbol(num_classes=10, add_stn=False, **kwargs): - data = mx.symbol.Variable('data') - if add_stn: - data = mx.sym.SpatialTransformer(data=data, loc=get_loc(data), target_shape = (28,28), - transform_type="affine", sampler_type="bilinear") - # first conv - conv1 = mx.symbol.Convolution(data=data, kernel=(5,5), num_filter=20) - tanh1 = mx.symbol.Activation(data=conv1, act_type="tanh") - pool1 = mx.symbol.Pooling(data=tanh1, pool_type="max", - kernel=(2,2), stride=(2,2)) - # second conv - conv2 = mx.symbol.Convolution(data=pool1, kernel=(5,5), num_filter=50) - tanh2 = mx.symbol.Activation(data=conv2, act_type="tanh") - pool2 = mx.symbol.Pooling(data=tanh2, pool_type="max", - kernel=(2,2), stride=(2,2)) - # first fullc - flatten = mx.symbol.Flatten(data=pool2) - fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=500) - tanh3 = mx.symbol.Activation(data=fc1, act_type="tanh") - # second fullc - fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=num_classes) - # loss - lenet = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') - return lenet diff --git a/example/image-classification/symbols/mlp.py b/example/image-classification/symbols/mlp.py deleted file mode 100644 index 4b190b29db9e..000000000000 --- a/example/image-classification/symbols/mlp.py +++ /dev/null @@ -1,32 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -""" -a simple multilayer perceptron -""" -import mxnet as mx - -def get_symbol(num_classes=10, **kwargs): - data = mx.symbol.Variable('data') - data = mx.sym.Flatten(data=data) - fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) - act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) - act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes) - mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') - return mlp diff --git a/example/image-classification/symbols/mobilenet.py b/example/image-classification/symbols/mobilenet.py deleted file mode 100644 index bf3de4a2c6f8..000000000000 --- a/example/image-classification/symbols/mobilenet.py +++ /dev/null @@ -1,144 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -*- coding:utf-8 -*- -''' -mobilenet -Suittable for image with around resolution x resolution, resolution is multiple of 32. 
- -Reference: -MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications -https://arxiv.org/abs/1704.04861 -''' - -__author__ = 'qingzhouzhen' -__date__ = '17/8/5' -__modify__ = 'dwSun' -__modified_date__ = '17/11/30' - - -import mxnet as mx - -alpha_values = [0.25, 0.50, 0.75, 1.0] - - -def Conv(data, num_filter=1, kernel=(1, 1), stride=(1, 1), pad=(0, 0), num_group=1, name='', suffix=''): - conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, num_group=num_group, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' % (name, suffix)) - bn = mx.sym.BatchNorm(data=conv, name='%s%s_batchnorm' % (name, suffix), fix_gamma=True) - act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' % (name, suffix)) - return act - - -def Conv_DPW(data, depth=1, stride=(1, 1), name='', idx=0, suffix=''): - conv_dw = Conv(data, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=stride, name="conv_%d_dw" % (idx), suffix=suffix) - conv = Conv(conv_dw, num_filter=depth * stride[0], kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_%d" % (idx), suffix=suffix) - return conv - - -def get_symbol_compact(num_classes, alpha=1, resolution=224, **kwargs): - assert alpha in alpha_values, 'Invalid alpha={0}, must be one of {1}'.format(alpha, alpha_values) - assert resolution % 32 == 0, 'resolution must be multiple of 32' - - base = int(32 * alpha) - - data = mx.symbol.Variable(name="data") # 224 - conv_1 = Conv(data, num_filter=base, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_1") # 32*alpha, 224/112 - - conv_2_dw = Conv(conv_1, num_group=base, num_filter=base, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_2_dw") # 112/112 - conv_2 = Conv(conv_2_dw, num_filter=base * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_2") # 32*alpha, 112/112 - - conv_3_dpw = Conv_DPW(conv_2, depth=base * 2, stride=(2, 2), idx=3) # 64*alpha, 112/56 => 56/56 - conv_4_dpw = Conv_DPW(conv_3_dpw, depth=base * 4, stride=(1, 1), idx=4) # 128*alpha, 56/56 =>56/56 - conv_5_dpw = Conv_DPW(conv_4_dpw, depth=base * 4, stride=(2, 2), idx=5) # 128*alpha, 56/28 => 28/28 - conv_6_dpw = Conv_DPW(conv_5_dpw, depth=base * 8, stride=(1, 1), idx=6) # 256*alpha, 28/28 => 28/28 - conv_7_dpw = Conv_DPW(conv_6_dpw, depth=base * 8, stride=(2, 2), idx=7) # 256*alpha, 28/14 => 14/14 - conv_dpw = conv_7_dpw - - for idx in range(8, 13): - conv_dpw = Conv_DPW(conv_dpw, depth=base * 16, stride=(1, 1), idx=idx) # 512*alpha, 14/14 - - conv_12_dpw = conv_dpw - conv_13_dpw = Conv_DPW(conv_12_dpw, depth=base * 16, stride=(2, 2), idx=13) # 512*alpha, 14/7 => 7/7 - conv_14_dpw = Conv_DPW(conv_13_dpw, depth=base * 32, stride=(1, 1), idx=14) # 1024*alpha, 7/7 => 7/7 - - pool_size = int(resolution / 32) - pool = mx.sym.Pooling(data=conv_14_dpw, kernel=(pool_size, pool_size), stride=(1, 1), pool_type="avg", name="global_pool") - flatten = mx.sym.Flatten(data=pool, name="flatten") - fc = mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes, name='fc') - softmax = mx.symbol.SoftmaxOutput(data=fc, name='softmax') - return softmax - - -def get_symbol(num_classes, alpha=1, resolution=224, **kwargs): - assert alpha in alpha_values, 'Invalid alpha=[{0}], must be one of [{1}]'.format(alpha, alpha_values) - assert resolution % 32 == 0, 'resolution must be multpile of 32' - - base = int(32 * alpha) - - data = mx.symbol.Variable(name="data") # 224 - depth = base # 32*alpha - conv_1 = Conv(data, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_1") # 224/112 - - 
depth = base # 32*alpha - conv_2_dw = Conv(conv_1, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_2_dw") # 112/112 - conv_2 = Conv(conv_2_dw, num_filter=depth * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_2") # 112/112 - - depth = base * 2 # 64*alpha - conv_3_dw = Conv(conv_2, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_3_dw") # 112/56 - conv_3 = Conv(conv_3_dw, num_filter=depth * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_3") # 56/56 - - depth = base * 4 # 128*alpha - conv_4_dw = Conv(conv_3, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_4_dw") # 56/56 - conv_4 = Conv(conv_4_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_4") # 56/56 - - depth = base * 4 # 128*alpha - conv_5_dw = Conv(conv_4, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_5_dw") # 56/28 - conv_5 = Conv(conv_5_dw, num_filter=depth * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_5") # 28/28 - - depth = base * 8 # 256*alpha - conv_6_dw = Conv(conv_5, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_6_dw") # 28/28 - conv_6 = Conv(conv_6_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_6") # 28/28 - - depth = base * 8 # 256*alpha - conv_7_dw = Conv(conv_6, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_7_dw") # 28/14 - conv_7 = Conv(conv_7_dw, num_filter=depth * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_7") # 14/14 - - depth = base * 16 # 512*alpha - conv_8_dw = Conv(conv_7, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_8_dw") # 14/14 - conv_8 = Conv(conv_8_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_8") # 14/14 - conv_9_dw = Conv(conv_8, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_9_dw") # 14/14 - conv_9 = Conv(conv_9_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_9") # 14/14 - conv_10_dw = Conv(conv_9, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_10_dw") # 14/14 - conv_10 = Conv(conv_10_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_10") # 14/14 - conv_11_dw = Conv(conv_10, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_11_dw") # 14/14 - conv_11 = Conv(conv_11_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_11") # 14/14 - conv_12_dw = Conv(conv_11, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_12_dw") # 14/14 - conv_12 = Conv(conv_12_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_12") # 14/14 - - depth = base * 16 # 512*alpha - conv_13_dw = Conv(conv_12, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_13_dw") # 14/7 - conv_13 = Conv(conv_13_dw, num_filter=depth * 2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_13") # 7/7 - - depth = base * 32 # 1024*alpha - conv_14_dw = Conv(conv_13, num_group=depth, num_filter=depth, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_14_dw") # 7/7 - conv_14 = Conv(conv_14_dw, num_filter=depth, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_14") # 7/7 - - pool_size = int(resolution / 32) - pool = mx.sym.Pooling(data=conv_14, kernel=(pool_size, 
pool_size), stride=(1, 1), pool_type="avg", name="global_pool") - flatten = mx.sym.Flatten(data=pool, name="flatten") - fc = mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes, name='fc') - softmax = mx.symbol.SoftmaxOutput(data=fc, name='softmax') - return softmax diff --git a/example/image-classification/symbols/mobilenetv2.py b/example/image-classification/symbols/mobilenetv2.py deleted file mode 100644 index 00831ce4023e..000000000000 --- a/example/image-classification/symbols/mobilenetv2.py +++ /dev/null @@ -1,218 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# -*- coding:utf-8 -*- -''' -MobileNetV2, implemented in built-in symbols. - -Reference: -Inverted Residuals and Linear Bottlenecks: -Mobile Networks for Classification, Detection and Segmentation -https://arxiv.org/abs/1801.04381 -''' -__author__ = 'liangfu' -__date__ = '18/4/3' - -import mxnet as mx - -def relu6(data, prefix): - return mx.sym.clip(data,0,6,name='%s-relu6'%prefix) - -def shortcut(data_in, data_residual, prefix): - out=mx.sym.elemwise_add(data_in, data_residual, name='%s-shortcut'%prefix) - return out - -def mobilenet_unit(data, num_filter=1, kernel=(1, 1), stride=(1, 1), pad=(0, 0), num_group=1, if_act=True, prefix=''): - conv = mx.sym.Convolution( - data=data, - num_filter=num_filter, - kernel=kernel, - num_group=num_group, - stride=stride, - pad=pad, - no_bias=True, - name='%s-conv2d'%prefix) - bn = mx.sym.BatchNorm(data=conv, name='%s-batchnorm'%prefix, fix_gamma=False, use_global_stats=False, eps=1e-5) - if if_act: - act = relu6(bn, prefix) - return act - else: - return bn - -def inverted_residual_unit(data, num_in_filter, num_filter, ifshortcut, stride, kernel, pad, expansion_factor, prefix): - num_expfilter = int(round(num_in_filter*expansion_factor)) - - channel_expand = mobilenet_unit( - data=data, - num_filter=num_expfilter, - kernel=(1,1), - stride=(1,1), - pad=(0,0), - num_group=1, - if_act=True, - prefix='%s-exp'%prefix, - ) - bottleneck_conv = mobilenet_unit( - data= channel_expand, - num_filter=num_expfilter, - stride=stride, - kernel=kernel, - pad=pad, - num_group=num_expfilter, - if_act=True, - prefix='%s-depthwise'%prefix, - ) - linear_out = mobilenet_unit( - data=bottleneck_conv, - num_filter=num_filter, - kernel=(1, 1), - stride=(1, 1), - pad=(0, 0), - num_group=1, - if_act=False, - prefix='%s-linear'%prefix - ) - if ifshortcut: - out = shortcut( - data_in=data, - data_residual=linear_out, - prefix=prefix, - ) - return out - else: - return linear_out - -def inverted_residual_blocks(data, in_c, t, c, n, s, prefix): - first_block = inverted_residual_unit( - data=data, - num_in_filter=in_c, - num_filter=c, - ifshortcut=False, - stride=(s,s), - kernel=(3,3), - pad=(1,1), - expansion_factor=t, - prefix='%s-block0'%prefix - ) - - 
last_residual_block = first_block - last_c = c - - for i in range(1,n): - last_residual_block = inverted_residual_unit( - data=last_residual_block, - num_in_filter=last_c, - num_filter=c, - ifshortcut=True, - stride=(1,1), - kernel=(3,3), - pad=(1,1), - expansion_factor=t, - prefix='%s-block%d'%(prefix, i) - ) - return last_residual_block - -MNETV2_CONFIGS_MAP = { - (224,224):{ - 'firstconv_filter_num': 32, # 3*224*224 -> 32*112*112 - # t, c, n, s - 'bottleneck_params_list':[ - (1, 16, 1, 1), # 32x112x112 -> 16x112x112 - (6, 24, 2, 2), # 16x112x112 -> 24x56x56 - (6, 32, 3, 2), # 24x56x56 -> 32x28x28 - (6, 64, 4, 2), # 32x28x28 -> 64x14x14 - (6, 96, 3, 1), # 64x14x14 -> 96x14x14 - (6, 160, 3, 2), # 96x14x14 -> 160x7x7 - (6, 320, 1, 1), # 160x7x7 -> 320x7x7 - ], - 'filter_num_before_gp': 1280, # 320x7x7 -> 1280x7x7 - } -} - -class MobileNetV2(object): - def __init__(self, data_wh, multiplier, **kargs): - super(MobileNetV2, self).__init__() - self.data_wh=data_wh - self.multiplier=multiplier - if self.data_wh in MNETV2_CONFIGS_MAP: - self.config_map=MNETV2_CONFIGS_MAP[self.data_wh] - else: - self.config_map=MNETV2_CONFIGS_MAP[(224, 224)] - - def build_network(self, class_num=1000, **configs): - data = mx.sym.Variable('data') - self.config_map.update(configs) - # first conv2d block - first_c = int(round(self.config_map['firstconv_filter_num']*self.multiplier)) - first_layer = mobilenet_unit( - data=data, - num_filter=first_c, - kernel=(3,3), - stride=(2,2), - pad=(1,1), - if_act=True, - prefix='first-3x3-conv' - ) - last_bottleneck_layer = first_layer - in_c = first_c - # bottleneck sequences - for i, layer_setting in enumerate(self.config_map['bottleneck_params_list']): - t, c, n, s = layer_setting - last_bottleneck_layer = inverted_residual_blocks( - data=last_bottleneck_layer, - in_c=in_c, t=t, c=int(round(c*self.multiplier)), n=n, s=s, - prefix='seq-%d'%i - ) - in_c = int(round(c*self.multiplier)) - # last conv2d block before global pooling - last_fm = mobilenet_unit( - data=last_bottleneck_layer, - num_filter=int(1280 * self.multiplier) if self.multiplier > 1.0 else 1280, - kernel=(1,1), - stride=(1,1), - pad=(0,0), - if_act=True, - prefix='last-1x1-conv' - ) - # global average pooling - pool_size = int(self.data_wh[0] / 32) - pool = mx.sym.Pooling(data=last_fm, kernel=(pool_size, pool_size), stride=(1, 1), - pool_type="avg", name="global_pool", global_pool=True) - flatten = mx.sym.Flatten(data=pool, name="flatten") - fc = mx.symbol.FullyConnected(data=flatten, num_hidden=class_num, name='fc') - softmax = mx.symbol.SoftmaxOutput(data=fc, name='softmax') - - return softmax - - def __call__(self, class_num=1000, layer_out=None, **configs): - # build the whole architecture of mobilenet v2 here - sym = self.build_network(class_num=class_num,**configs) - if layer_out is None: - return sym - - internals = sym.get_internals() - if type(layer_out) is list or type(layer_out) is tuple: - layers_out = [internals[layer_nm.strip() + '_output'] for layer_nm in layer_out] - return layers_out - else: - layer_out = internals[layer_out.strip() + '_output'] - return layer_out - -def get_symbol(num_classes=1000, multiplier=1.0): - mnetgen = MobileNetV2((224,224), multiplier=multiplier) - mnetv2_sym = mnetgen(class_num=num_classes, layer_out=None) - return mnetv2_sym diff --git a/example/image-classification/symbols/resnet-v1.py b/example/image-classification/symbols/resnet-v1.py deleted file mode 100644 index e5752f775447..000000000000 --- a/example/image-classification/symbols/resnet-v1.py +++ /dev/null @@ 
-1,200 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -''' -Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py -(Original author Wei Wu) by Antti-Pekka Hynninen - -Implementing the original resnet ILSVRC 2015 winning network from: - -Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Deep Residual Learning for Image Recognition" -''' -import mxnet as mx -import numpy as np - -def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, bn_mom=0.9, workspace=256, memonger=False): - """Return ResNet Unit symbol for building ResNet - Parameters - ---------- - data : str - Input data - num_filter : int - Number of output channels - bnf : int - Bottle neck channels factor with regard to num_filter - stride : tuple - Stride used in convolution - dim_match : Boolean - True means channel number between input and output is the same, otherwise means differ - name : str - Base name of the operators - workspace : int - Workspace used in convolution operator - """ - if bottle_neck: - conv1 = mx.sym.Convolution(data=data, num_filter=int(num_filter*0.25), kernel=(1,1), stride=stride, pad=(0,0), - no_bias=True, workspace=workspace, name=name + '_conv1') - bn1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - conv2 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - bn2 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2') - act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') - conv3 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True, - workspace=workspace, name=name + '_conv3') - bn3 = mx.sym.BatchNorm(data=conv3, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3') - - if dim_match: - shortcut = data - else: - conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - workspace=workspace, name=name+'_conv1sc') - shortcut = mx.sym.BatchNorm(data=conv1sc, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_sc') - if memonger: - shortcut._set_attr(mirror_stage='True') - return mx.sym.Activation(data=bn3 + shortcut, act_type='relu', name=name + '_relu3') - else: - conv1 = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv1') - bn1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - conv2 = 
mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - bn2 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2') - - if dim_match: - shortcut = data - else: - conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - workspace=workspace, name=name+'_conv1sc') - shortcut = mx.sym.BatchNorm(data=conv1sc, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_sc') - if memonger: - shortcut._set_attr(mirror_stage='True') - return mx.sym.Activation(data=bn2 + shortcut, act_type='relu', name=name + '_relu3') - -def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, dtype='float32', memonger=False): - """Return ResNet symbol of - Parameters - ---------- - units : list - Number of units in each stage - num_stages : int - Number of stage - filter_list : list - Channel size of each stage - num_classes : int - Ouput size of symbol - dataset : str - Dataset type, only cifar10 and imagenet supports - workspace : int - Workspace used in convolution operator - dtype : str - Precision (float32 or float16) - """ - num_unit = len(units) - assert(num_unit == num_stages) - data = mx.sym.Variable(name='data') - if dtype == 'float32': - data = mx.sym.identity(data=data, name='id') - else: - if dtype == 'float16': - data = mx.sym.Cast(data=data, dtype=np.float16) - (nchannel, height, width) = image_shape - if height <= 32: # such as cifar10 - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1), - no_bias=True, name="conv0", workspace=workspace) - # Is this BatchNorm supposed to be here? 
- body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0') - else: # often expected to be 224 such as imagenet - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3), - no_bias=True, name="conv0", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0') - body = mx.sym.Activation(data=body, act_type='relu', name='relu0') - body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max') - - for i in range(num_stages): - body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False, - name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, workspace=workspace, - memonger=memonger) - for j in range(units[i]-1): - body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2), - bottle_neck=bottle_neck, workspace=workspace, memonger=memonger) - # bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1') - # relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1') - # Although kernel is not used here when global_pool=True, we should put one - pool1 = mx.sym.Pooling(data=body, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') - flat = mx.sym.Flatten(data=pool1) - fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1') - if dtype == 'float16': - fc1 = mx.sym.Cast(data=fc1, dtype=np.float32) - return mx.sym.SoftmaxOutput(data=fc1, name='softmax') - -def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32', **kwargs): - """ - Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py - (Original author Wei Wu) by Antti-Pekka Hynninen - Implementing the original resnet ILSVRC 2015 winning network from: - Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 
"Deep Residual Learning for Image Recognition" - """ - image_shape = [int(l) for l in image_shape.split(',')] - (nchannel, height, width) = image_shape - if height <= 28: - num_stages = 3 - if (num_layers-2) % 9 == 0 and num_layers >= 164: - per_unit = [(num_layers-2)//9] - filter_list = [16, 64, 128, 256] - bottle_neck = True - elif (num_layers-2) % 6 == 0 and num_layers < 164: - per_unit = [(num_layers-2)//6] - filter_list = [16, 16, 32, 64] - bottle_neck = False - else: - raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) - units = per_unit * num_stages - else: - if num_layers >= 50: - filter_list = [64, 256, 512, 1024, 2048] - bottle_neck = True - else: - filter_list = [64, 64, 128, 256, 512] - bottle_neck = False - num_stages = 4 - if num_layers == 18: - units = [2, 2, 2, 2] - elif num_layers == 34: - units = [3, 4, 6, 3] - elif num_layers == 50: - units = [3, 4, 6, 3] - elif num_layers == 101: - units = [3, 4, 23, 3] - elif num_layers == 152: - units = [3, 8, 36, 3] - elif num_layers == 200: - units = [3, 24, 36, 3] - elif num_layers == 269: - units = [3, 30, 48, 8] - else: - raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) - - return resnet(units = units, - num_stages = num_stages, - filter_list = filter_list, - num_classes = num_classes, - image_shape = image_shape, - bottle_neck = bottle_neck, - workspace = conv_workspace, - dtype = dtype) diff --git a/example/image-classification/symbols/resnetv1.py b/example/image-classification/symbols/resnetv1.py deleted file mode 100644 index e5752f775447..000000000000 --- a/example/image-classification/symbols/resnetv1.py +++ /dev/null @@ -1,200 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -''' -Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py -(Original author Wei Wu) by Antti-Pekka Hynninen - -Implementing the original resnet ILSVRC 2015 winning network from: - -Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 
"Deep Residual Learning for Image Recognition" -''' -import mxnet as mx -import numpy as np - -def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, bn_mom=0.9, workspace=256, memonger=False): - """Return ResNet Unit symbol for building ResNet - Parameters - ---------- - data : str - Input data - num_filter : int - Number of output channels - bnf : int - Bottle neck channels factor with regard to num_filter - stride : tuple - Stride used in convolution - dim_match : Boolean - True means channel number between input and output is the same, otherwise means differ - name : str - Base name of the operators - workspace : int - Workspace used in convolution operator - """ - if bottle_neck: - conv1 = mx.sym.Convolution(data=data, num_filter=int(num_filter*0.25), kernel=(1,1), stride=stride, pad=(0,0), - no_bias=True, workspace=workspace, name=name + '_conv1') - bn1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - conv2 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - bn2 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2') - act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') - conv3 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True, - workspace=workspace, name=name + '_conv3') - bn3 = mx.sym.BatchNorm(data=conv3, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3') - - if dim_match: - shortcut = data - else: - conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - workspace=workspace, name=name+'_conv1sc') - shortcut = mx.sym.BatchNorm(data=conv1sc, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_sc') - if memonger: - shortcut._set_attr(mirror_stage='True') - return mx.sym.Activation(data=bn3 + shortcut, act_type='relu', name=name + '_relu3') - else: - conv1 = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv1') - bn1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - conv2 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - bn2 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2') - - if dim_match: - shortcut = data - else: - conv1sc = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - workspace=workspace, name=name+'_conv1sc') - shortcut = mx.sym.BatchNorm(data=conv1sc, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_sc') - if memonger: - shortcut._set_attr(mirror_stage='True') - return mx.sym.Activation(data=bn2 + shortcut, act_type='relu', name=name + '_relu3') - -def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, dtype='float32', memonger=False): - """Return ResNet symbol of - Parameters - ---------- - units : list - Number of units in each stage - num_stages : int - Number of stage - filter_list : list - Channel size of each stage - num_classes : int - Ouput size 
of symbol - dataset : str - Dataset type, only cifar10 and imagenet supports - workspace : int - Workspace used in convolution operator - dtype : str - Precision (float32 or float16) - """ - num_unit = len(units) - assert(num_unit == num_stages) - data = mx.sym.Variable(name='data') - if dtype == 'float32': - data = mx.sym.identity(data=data, name='id') - else: - if dtype == 'float16': - data = mx.sym.Cast(data=data, dtype=np.float16) - (nchannel, height, width) = image_shape - if height <= 32: # such as cifar10 - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1), - no_bias=True, name="conv0", workspace=workspace) - # Is this BatchNorm supposed to be here? - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0') - else: # often expected to be 224 such as imagenet - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3), - no_bias=True, name="conv0", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0') - body = mx.sym.Activation(data=body, act_type='relu', name='relu0') - body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max') - - for i in range(num_stages): - body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False, - name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, workspace=workspace, - memonger=memonger) - for j in range(units[i]-1): - body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2), - bottle_neck=bottle_neck, workspace=workspace, memonger=memonger) - # bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1') - # relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1') - # Although kernel is not used here when global_pool=True, we should put one - pool1 = mx.sym.Pooling(data=body, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') - flat = mx.sym.Flatten(data=pool1) - fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1') - if dtype == 'float16': - fc1 = mx.sym.Cast(data=fc1, dtype=np.float32) - return mx.sym.SoftmaxOutput(data=fc1, name='softmax') - -def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32', **kwargs): - """ - Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py - (Original author Wei Wu) by Antti-Pekka Hynninen - Implementing the original resnet ILSVRC 2015 winning network from: - Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 
"Deep Residual Learning for Image Recognition" - """ - image_shape = [int(l) for l in image_shape.split(',')] - (nchannel, height, width) = image_shape - if height <= 28: - num_stages = 3 - if (num_layers-2) % 9 == 0 and num_layers >= 164: - per_unit = [(num_layers-2)//9] - filter_list = [16, 64, 128, 256] - bottle_neck = True - elif (num_layers-2) % 6 == 0 and num_layers < 164: - per_unit = [(num_layers-2)//6] - filter_list = [16, 16, 32, 64] - bottle_neck = False - else: - raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) - units = per_unit * num_stages - else: - if num_layers >= 50: - filter_list = [64, 256, 512, 1024, 2048] - bottle_neck = True - else: - filter_list = [64, 64, 128, 256, 512] - bottle_neck = False - num_stages = 4 - if num_layers == 18: - units = [2, 2, 2, 2] - elif num_layers == 34: - units = [3, 4, 6, 3] - elif num_layers == 50: - units = [3, 4, 6, 3] - elif num_layers == 101: - units = [3, 4, 23, 3] - elif num_layers == 152: - units = [3, 8, 36, 3] - elif num_layers == 200: - units = [3, 24, 36, 3] - elif num_layers == 269: - units = [3, 30, 48, 8] - else: - raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) - - return resnet(units = units, - num_stages = num_stages, - filter_list = filter_list, - num_classes = num_classes, - image_shape = image_shape, - bottle_neck = bottle_neck, - workspace = conv_workspace, - dtype = dtype) diff --git a/example/image-classification/symbols/resnext.py b/example/image-classification/symbols/resnext.py deleted file mode 100644 index 59749430c76c..000000000000 --- a/example/image-classification/symbols/resnext.py +++ /dev/null @@ -1,210 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -''' -Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py -Original author Wei Wu - -Implemented the following paper: -Saining Xie, Ross Girshick, Piotr Dollar, Zhuowen Tu, Kaiming He. 
"Aggregated Residual Transformations for Deep Neural Network" -''' -import mxnet as mx -import numpy as np - -def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, num_group=32, bn_mom=0.9, workspace=256, memonger=False): - """Return ResNet Unit symbol for building ResNet - Parameters - ---------- - data : str - Input data - num_filter : int - Number of output channels - bnf : int - Bottle neck channels factor with regard to num_filter - stride : tuple - Stride used in convolution - dim_match : Boolean - True means channel number between input and output is the same, otherwise means differ - name : str - Base name of the operators - workspace : int - Workspace used in convolution operator - """ - if bottle_neck: - # the same as https://github.com/facebook/fb.resnet.torch#notes, a bit difference with origin paper - - conv1 = mx.sym.Convolution(data=data, num_filter=int(num_filter*0.5), kernel=(1,1), stride=(1,1), pad=(0,0), - no_bias=True, workspace=workspace, name=name + '_conv1') - bn1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - - - conv2 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.5), num_group=num_group, kernel=(3,3), stride=stride, pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - bn2 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2') - act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') - - - conv3 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True, - workspace=workspace, name=name + '_conv3') - bn3 = mx.sym.BatchNorm(data=conv3, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3') - - if dim_match: - shortcut = data - else: - shortcut_conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - workspace=workspace, name=name+'_sc') - shortcut = mx.sym.BatchNorm(data=shortcut_conv, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_sc_bn') - - if memonger: - shortcut._set_attr(mirror_stage='True') - eltwise = bn3 + shortcut - return mx.sym.Activation(data=eltwise, act_type='relu', name=name + '_relu') - else: - - conv1 = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv1') - bn1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - - - conv2 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - bn2 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2') - - if dim_match: - shortcut = data - else: - shortcut_conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - workspace=workspace, name=name+'_sc') - shortcut = mx.sym.BatchNorm(data=shortcut_conv, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_sc_bn') - - if memonger: - shortcut._set_attr(mirror_stage='True') - eltwise = bn2 + shortcut - return mx.sym.Activation(data=eltwise, act_type='relu', name=name + '_relu') - -def resnext(units, num_stages, filter_list, num_classes, num_group, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, 
dtype='float32', memonger=False): - """Return ResNeXt symbol of - Parameters - ---------- - units : list - Number of units in each stage - num_stages : int - Number of stage - filter_list : list - Channel size of each stage - num_classes : int - Ouput size of symbol - num_groupes: int - Number of conv groups - dataset : str - Dataset type, only cifar10 and imagenet supports - workspace : int - Workspace used in convolution operator - dtype : str - Precision (float32 or float16) - """ - num_unit = len(units) - assert(num_unit == num_stages) - data = mx.sym.Variable(name='data') - if dtype == 'float32': - data = mx.sym.identity(data=data, name='id') - else: - if dtype == 'float16': - data = mx.sym.Cast(data=data, dtype=np.float16) - data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data') - (nchannel, height, width) = image_shape - if height <= 32: # such as cifar10 - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1), - no_bias=True, name="conv0", workspace=workspace) - else: # often expected to be 224 such as imagenet - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3), - no_bias=True, name="conv0", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0') - body = mx.sym.Activation(data=body, act_type='relu', name='relu0') - body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max') - - for i in range(num_stages): - body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False, - name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, num_group=num_group, - bn_mom=bn_mom, workspace=workspace, memonger=memonger) - for j in range(units[i]-1): - body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2), - bottle_neck=bottle_neck, num_group=num_group, bn_mom=bn_mom, workspace=workspace, memonger=memonger) - - pool1 = mx.sym.Pooling(data=body, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') - flat = mx.sym.Flatten(data=pool1) - fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1') - if dtype == 'float16': - fc1 = mx.sym.Cast(data=fc1, dtype=np.float32) - return mx.sym.SoftmaxOutput(data=fc1, name='softmax') - -def get_symbol(num_classes, num_layers, image_shape, num_group=32, conv_workspace=256, dtype='float32', **kwargs): - """ - Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py - Original author Wei Wu - """ - image_shape = [int(l) for l in image_shape.split(',')] - (nchannel, height, width) = image_shape - if height <= 32: - num_stages = 3 - if (num_layers-2) % 9 == 0 and num_layers >= 164: - per_unit = [(num_layers-2)//9] - filter_list = [16, 64, 128, 256] - bottle_neck = True - elif (num_layers-2) % 6 == 0 and num_layers < 164: - per_unit = [(num_layers-2)//6] - filter_list = [16, 16, 32, 64] - bottle_neck = False - else: - raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) - units = per_unit * num_stages - else: - if num_layers >= 50: - filter_list = [64, 256, 512, 1024, 2048] - bottle_neck = True - else: - filter_list = [64, 64, 128, 256, 512] - bottle_neck = False - num_stages = 4 - if num_layers == 18: - units = [2, 2, 2, 2] - elif num_layers == 34: - units = [3, 4, 6, 3] - elif num_layers == 50: - units = [3, 4, 6, 3] - elif num_layers == 101: - units = [3, 4, 23, 3] - elif 
num_layers == 152: - units = [3, 8, 36, 3] - elif num_layers == 200: - units = [3, 24, 36, 3] - elif num_layers == 269: - units = [3, 30, 48, 8] - else: - raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) - - return resnext(units = units, - num_stages = num_stages, - filter_list = filter_list, - num_classes = num_classes, - num_group = num_group, - image_shape = image_shape, - bottle_neck = bottle_neck, - workspace = conv_workspace, - dtype = dtype) diff --git a/example/image-classification/symbols/vgg.py b/example/image-classification/symbols/vgg.py deleted file mode 100644 index e715ff6edc92..000000000000 --- a/example/image-classification/symbols/vgg.py +++ /dev/null @@ -1,76 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""References: - -Simonyan, Karen, and Andrew Zisserman. "Very deep convolutional networks for -large-scale image recognition." arXiv preprint arXiv:1409.1556 (2014). -""" - -import mxnet as mx -import numpy as np - -def get_feature(internel_layer, layers, filters, batch_norm = False, **kwargs): - for i, num in enumerate(layers): - for j in range(num): - internel_layer = mx.sym.Convolution(data = internel_layer, kernel=(3, 3), pad=(1, 1), num_filter=filters[i], name="conv%s_%s" %(i + 1, j + 1)) - if batch_norm: - internel_layer = mx.symbol.BatchNorm(data=internel_layer, name="bn%s_%s" %(i + 1, j + 1)) - internel_layer = mx.sym.Activation(data=internel_layer, act_type="relu", name="relu%s_%s" %(i + 1, j + 1)) - internel_layer = mx.sym.Pooling(data=internel_layer, pool_type="max", kernel=(2, 2), stride=(2,2), name="pool%s" %(i + 1)) - return internel_layer - -def get_classifier(input_data, num_classes, **kwargs): - flatten = mx.sym.Flatten(data=input_data, name="flatten") - fc6 = mx.sym.FullyConnected(data=flatten, num_hidden=4096, name="fc6") - relu6 = mx.sym.Activation(data=fc6, act_type="relu", name="relu6") - drop6 = mx.sym.Dropout(data=relu6, p=0.5, name="drop6") - fc7 = mx.sym.FullyConnected(data=drop6, num_hidden=4096, name="fc7") - relu7 = mx.sym.Activation(data=fc7, act_type="relu", name="relu7") - drop7 = mx.sym.Dropout(data=relu7, p=0.5, name="drop7") - fc8 = mx.sym.FullyConnected(data=drop7, num_hidden=num_classes, name="fc8") - return fc8 - -def get_symbol(num_classes, num_layers=11, batch_norm=False, dtype='float32', **kwargs): - """ - Parameters - ---------- - num_classes : int, default 1000 - Number of classification classes. - num_layers : int - Number of layers for the variant of densenet. Options are 11, 13, 16, 19. - batch_norm : bool, default False - Use batch normalization. - dtype: str, float32 or float16 - Data precision. 
- """ - vgg_spec = {11: ([1, 1, 2, 2, 2], [64, 128, 256, 512, 512]), - 13: ([2, 2, 2, 2, 2], [64, 128, 256, 512, 512]), - 16: ([2, 2, 3, 3, 3], [64, 128, 256, 512, 512]), - 19: ([2, 2, 4, 4, 4], [64, 128, 256, 512, 512])} - if num_layers not in vgg_spec: - raise ValueError("Invalide num_layers {}. Possible choices are 11,13,16,19.".format(num_layers)) - layers, filters = vgg_spec[num_layers] - data = mx.sym.Variable(name="data") - if dtype == 'float16': - data = mx.sym.Cast(data=data, dtype=np.float16) - feature = get_feature(data, layers, filters, batch_norm) - classifier = get_classifier(feature, num_classes) - if dtype == 'float16': - classifier = mx.sym.Cast(data=classifier, dtype=np.float32) - symbol = mx.sym.SoftmaxOutput(data=classifier, name='softmax') - return symbol diff --git a/example/image-classification/test_score.py b/example/image-classification/test_score.py deleted file mode 100644 index 1a82bcff5ba3..000000000000 --- a/example/image-classification/test_score.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -""" -test pretrained models -""" -from __future__ import print_function -import mxnet as mx -from common import find_mxnet, modelzoo -from score import score -import pytest - -@pytest.fixture(scope="session") -def imagenet_val_5k_settings(): - mx.test_utils.download( - 'http://data.mxnet.io/data/val-5k-256.rec', 'data/val-5k-256.rec') - num_gpus = mx.context.num_gpus() - assert num_gpus > 0 - gpus = ','.join(map(str, range(num_gpus))) - batch_size = 16 * num_gpus - kwargs = {'gpus':gpus, 'batch_size':batch_size, 'max_num_examples':500} - return 'data/val-5k-256.rec', kwargs - -def test_imagenet1k_resnet(imagenet_val_5k_settings): - imagenet_val_5k, kwargs = imagenet_val_5k_settings - models = ['imagenet1k-resnet-50', 'imagenet1k-resnet-152'] - accs = [.77, .78] - for (m, g) in zip(models, accs): - acc = mx.gluon.metric.create('acc') - (speed,) = score(model=m, data_val=imagenet_val_5k, - rgb_mean='0,0,0', metrics=acc, **kwargs) - r = acc.get()[1] - print('Tested %s, acc = %f, speed = %f img/sec' % (m, r, speed)) - assert r > g and r < g + .1 - -def test_imagenet1k_inception_bn(imagenet_val_5k_settings): - imagenet_val_5k, kwargs = imagenet_val_5k_settings - acc = mx.gluon.metric.create('acc') - m = 'imagenet1k-inception-bn' - g = 0.75 - (speed,) = score(model=m, - data_val=imagenet_val_5k, - rgb_mean='123.68,116.779,103.939', metrics=acc, **kwargs) - r = acc.get()[1] - print('Tested %s acc = %f, speed = %f img/sec' % (m, r, speed)) - assert r > g and r < g + .1 - diff --git a/example/image-classification/train_cifar10.R b/example/image-classification/train_cifar10.R deleted file mode 100644 index d4ac355348f6..000000000000 --- a/example/image-classification/train_cifar10.R +++ /dev/null @@ -1,145 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -require(mxnet) -require(argparse) - -get_iterator <- function(data.shape) { - data_dir <- args$data_dir - data.shape <- data.shape - train <- mx.io.ImageRecordIter( - path.imgrec = paste0(data_dir, "train.rec"), - batch.size = args$batch_size, - data.shape = data.shape, - rand.crop = TRUE, - rand.mirror = TRUE, - mean.img = paste0(data_dir, "mean.bin") - ) - - val <- mx.io.ImageRecordIter( - path.imgrec = paste0(data_dir, "test.rec"), - path.imglist = paste0(data_dir, "test.lst"), - batch.size = args$batch_size, - data.shape = data.shape, - rand.crop = TRUE, - rand.mirror = TRUE, - mean.img = paste0(data_dir, "mean.bin") - ) - ret <- list(train = train, value = val) -} - -parse_args <- function() { - parser <- ArgumentParser(description = 'train an image classifer on CIFAR10') - parser$add_argument('--network', - type = 'character', - default = 'resnet-28-small', - choices = c('alexnet', - 'lenet', - 'resnet', - 'googlenet', - 'inception-bn-28-small', - 'resnet-28-small'), - help = 'the network to use') - parser$add_argument('--data-dir', - type = 'character', - default = 'data/cifar10/', - help = 'the input data directory') - # num-examples - parser$add_argument('--cpu', - type = 'character', - default = F, - help = 'CPU will be used if true."') - parser$add_argument('--gpus', - type = 'character', - default = "0", - help = 'the gpus will be used, e.g "0,1,2,3"') - parser$add_argument('--batch-size', - type = 'integer', - default = 128, - help = 'the batch size') - parser$add_argument('--lr', - type = 'double', - default = .05, - help = 'the initial learning rate') - # lr-factor, lr-factor-epoch - parser$add_argument('--model-prefix', type = 'character', - help = 'the prefix of the model to load/save') - parser$add_argument('--resume-model-prefix', type = 'character', - help = 'resume prefix of the model to load/save') - parser$add_argument('--num-round', - type = 'integer', - default = 10, - help = 'the number of iterations over training data to train the model') - parser$add_argument('--kv-store', - type = 'character', - default = 'local', - help = 'the kvstore type') - parser$parse_args() -} -args <- parse_args() - -# load network definition -source(paste("symbol_", args$network, ".R", sep = '')) -print(paste0("Network used: ", args$network)) -net <- get_symbol(10) - -# save model -if (is.null(args$model_prefix)) { - checkpoint <- NULL -} else { - checkpoint <- mx.callback.save.checkpoint(args$model_prefix) -} - -# data -data.shape <- c(28, 28, 3) -data <- get_iterator(data.shape = data.shape) -train <- data$train -val <- data$value - -# train -if (args$cpu) { - print("Computing with CPU") - devs <- mx.cpu() -} else { - print(paste0("GPU used: ", args$gpus)) - if (grepl(',', args$gpu)) { - devs <- lapply(unlist(strsplit(args$gpus, ",")), function(i) { - mx.gpu(as.integer(i)) - }) - } else { - devs <- mx.gpu(as.integer(args$gpus)) - } -} - -#train -model <- mx.model.FeedForward.create( - X = train, - eval.data = val, - ctx = devs, - symbol = net, - eval.metric = mx.metric.accuracy, - num.round = args$num_round, - learning.rate = args$lr, - momentum = 0.9, - wd = 0.00001, - kvstore = args$kv_store, - array.batch.size = args$batch_size, - epoch.end.callback = checkpoint, - batch.end.callback = mx.callback.log.train.metric(50), - initializer = mx.init.Xavier(factor_type = "in", magnitude = 2.34), - optimizer = "sgd" -) diff --git a/example/image-classification/train_cifar10.py b/example/image-classification/train_cifar10.py deleted file mode 100644 index f449aad68836..000000000000 --- 
a/example/image-classification/train_cifar10.py +++ /dev/null @@ -1,76 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -import argparse -import logging -logging.basicConfig(level=logging.DEBUG) -from common import find_mxnet, data, fit -from common.util import download_file -import mxnet as mx - -def download_cifar10(): - data_dir="data" - fnames = (os.path.join(data_dir, "cifar10_train.rec"), - os.path.join(data_dir, "cifar10_val.rec")) - download_file('http://data.mxnet.io/data/cifar10/cifar10_val.rec', fnames[1]) - download_file('http://data.mxnet.io/data/cifar10/cifar10_train.rec', fnames[0]) - return fnames - -def set_cifar_aug(aug): - aug.set_defaults(rgb_mean='125.307,122.961,113.8575', rgb_std='51.5865,50.847,51.255') - aug.set_defaults(random_mirror=1, pad=4, fill_value=0, random_crop=1) - aug.set_defaults(min_random_size=32, max_random_size=32) - -if __name__ == '__main__': - # download data - (train_fname, val_fname) = download_cifar10() - - # parse args - parser = argparse.ArgumentParser(description="train cifar10", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - fit.add_fit_args(parser) - data.add_data_args(parser) - data.add_data_aug_args(parser) - # uncomment to set standard cifar augmentations - # set_cifar_aug(parser) - parser.set_defaults( - # network - network = 'resnet', - num_layers = 110, - # data - data_train = train_fname, - data_val = val_fname, - num_classes = 10, - num_examples = 50000, - image_shape = '3,28,28', - pad_size = 4, - # train - batch_size = 128, - num_epochs = 300, - lr = .05, - lr_step_epochs = '200,250', - ) - args = parser.parse_args() - - # load network - from importlib import import_module - net = import_module('symbols.'+args.network) - sym = net.get_symbol(**vars(args)) - - # train - fit.fit(args, sym, data.get_rec_iter) diff --git a/example/image-classification/train_imagenet.R b/example/image-classification/train_imagenet.R deleted file mode 100644 index d977be291a33..000000000000 --- a/example/image-classification/train_imagenet.R +++ /dev/null @@ -1,140 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. - -# -# This file shows how to train ImageNet dataset with several Convolutional Neural Network architectures in R. -# More information: https://blogs.technet.microsoft.com/machinelearning/2016/11/15/imagenet-deep-neural-network-training-using-microsoft-r-server-and-azure-gpu-vms/ -# -# To train ResNet-18: -# Rscript train_imagenet.R --network resnet --depth 18 --batch-size 512 --lr 0.1 --lr-factor 0.94 --gpu 0,1,2,3 --num-round 120 / -# --data-dir /path/to/data --train-dataset train.rec --val-dataset val.rec --log-dir $PWD --log-file resnet18-log.txt / -# --model-prefix resnet18 --kv-store device -# - -# Train imagenet -require(mxnet) -require(argparse) - -# Iterator -get_iterator <- function(args) { - data.shape <- c(args$data_shape, args$data_shape, 3) - train = mx.io.ImageRecordIter( - path.imgrec = file.path(args$data_dir, args$train_dataset), - batch.size = args$batch_size, - data.shape = data.shape, - mean.r = 123.68, - mean.g = 116.779, - mean.b = 103.939, - rand.crop = TRUE, - rand.mirror = TRUE - ) - - val = mx.io.ImageRecordIter( - path.imgrec = file.path(args$data_dir, args$val_dataset), - batch.size = args$batch_size, - data.shape = data.shape, - mean.r = 123.68, - mean.g = 116.779, - mean.b = 103.939, - rand.crop = FALSE, - rand.mirror = FALSE - ) - ret = list(train=train, value=val) -} - -# parse arguments -parse_args <- function() { - parser <- ArgumentParser(description='train an image classifer on ImageNet') - parser$add_argument('--network', type='character', default='resnet', - choices = c('resnet', 'inception-bn', 'googlenet', 'inception-resnet-v1', - 'inception-resnet-v2'), - help = 'the cnn to use') - parser$add_argument('--data-dir', type='character', help='the input data directory') - parser$add_argument('--gpus', type='character', - help='the gpus will be used, e.g "0,1,2,3"') - parser$add_argument('--batch-size', type='integer', default=128, - help='the batch size') - parser$add_argument('--lr', type='double', default=.01, - help='the initial learning rate') - parser$add_argument('--lr-factor', type='double', default=1, - help='times the lr with a factor for every lr-factor-epoch epoch') - parser$add_argument('--lr-factor-epoch', type='double', default=1, - help='the number of epoch to factor the lr, could be .5') - parser$add_argument('--lr-multifactor', type='character', - help='the epoch at which the lr is changed, e.g "15,30,45"') - parser$add_argument('--mom', type='double', default=.9, - help='momentum for sgd') - parser$add_argument('--wd', type='double', default=.0001, - help='weight decay for sgd') - parser$add_argument('--clip-gradient', type='double', default=5, - help='clip min/max gradient to prevent extreme value') - parser$add_argument('--model-prefix', type='character', - help='the prefix of the model to load/save') - parser$add_argument('--load-epoch', type='integer', - help="load the model on an epoch using the model-prefix") - parser$add_argument('--num-round', type='integer', default=10, - help='the number of iterations over training data to train the model') - parser$add_argument('--kv-store', type='character', default='local', - help='the kvstore type') - parser$add_argument('--num-examples', type='integer', default=1281167, - help='the number of training examples') - parser$add_argument('--num-classes', type='integer', default=1000, - help='the number of classes') - parser$add_argument('--log-file', type='character', - help='the name of 
log file') - parser$add_argument('--log-dir', type='character', default="/tmp/", - help='directory of the log file') - parser$add_argument('--train-dataset', type='character', default="train.rec", - help='train dataset name') - parser$add_argument('--val-dataset', type='character', default="val.rec", - help="validation dataset name") - parser$add_argument('--data-shape', type='integer', default=224, - help='set images shape') - parser$add_argument('--depth', type='integer', - help='the depth for resnet, it can be a value among 18, 50, 101, 152, 200, 269') - parser$parse_args() -} -args <- parse_args() - -# network -if (args$network == 'inception-bn'){ - source("symbol_inception-bn.R") -} else if (args$network == 'googlenet'){ - if(args$data_shape < 299) stop(paste0("The data shape for ", args$network, " has to be at least 299")) - source("symbol_googlenet.R") -} else if (args$network == 'inception-resnet-v1'){ - if(args$data_shape < 299) stop(paste0("The data shape for ", args$network, " has to be at least 299")) - source("symbol_inception-resnet-v1.R") -} else if (args$network == 'inception-resnet-v2'){ - if(args$data_shape < 299) stop(paste0("The data shape for ", args$network, " has to be at least 299")) - source("symbol_inception-resnet-v2.R") -} else if (args$network == 'resnet'){ - source("symbol_resnet-v2.R") -} else{ - stop("Wrong network") -} -if (is.null(args$depth)){ - net <- get_symbol(args$num_classes) -} else{ - net <- get_symbol(args$num_classes, args$depth) -} - -# train -source("train_model.R") -train_model.fit(args, net, get_iterator(args)) - - diff --git a/example/image-classification/train_imagenet.py b/example/image-classification/train_imagenet.py deleted file mode 100644 index 421c15db73ad..000000000000 --- a/example/image-classification/train_imagenet.py +++ /dev/null @@ -1,66 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -import argparse -import logging -logging.basicConfig(level=logging.DEBUG) -from common import find_mxnet, data, fit -from common.util import download_file -import mxnet as mx - -def set_imagenet_aug(aug): - # standard data augmentation setting for imagenet training - aug.set_defaults(rgb_mean='123.68,116.779,103.939', rgb_std='58.393,57.12,57.375') - aug.set_defaults(random_crop=0, random_resized_crop=1, random_mirror=1) - aug.set_defaults(min_random_area=0.08) - aug.set_defaults(max_random_aspect_ratio=4./3., min_random_aspect_ratio=3./4.) 
- aug.set_defaults(brightness=0.4, contrast=0.4, saturation=0.4, pca_noise=0.1) - -if __name__ == '__main__': - # parse args - parser = argparse.ArgumentParser(description="train imagenet-1k", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - fit.add_fit_args(parser) - data.add_data_args(parser) - data.add_data_aug_args(parser) - parser.set_defaults( - # network - network = 'resnet', - num_layers = 50, - # data - num_classes = 1000, - num_examples = 1281167, - image_shape = '3,224,224', - min_random_scale = 1, # if input image has min size k, suggest to use - # 256.0/x, e.g. 0.533 for 480 - # train - num_epochs = 80, - lr_step_epochs = '30,60', - dtype = 'float32' - ) - args = parser.parse_args() - if args.use_imagenet_data_augmentation: - set_imagenet_aug(parser) - - # load network - from importlib import import_module - net = import_module('symbols.'+args.network) - sym = net.get_symbol(**vars(args)) - - # train - fit.fit(args, sym, data.get_rec_iter) diff --git a/example/image-classification/train_mnist.R b/example/image-classification/train_mnist.R deleted file mode 100644 index 43f2c21310b4..000000000000 --- a/example/image-classification/train_mnist.R +++ /dev/null @@ -1,163 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require(argparse) -require(mxnet) - -download_ <- function(data_dir) { - dir.create(data_dir, showWarnings = FALSE) - setwd(data_dir) - if ((!file.exists('train-images-idx3-ubyte')) || - (!file.exists('train-labels-idx1-ubyte')) || - (!file.exists('t10k-images-idx3-ubyte')) || - (!file.exists('t10k-labels-idx1-ubyte'))) { - download.file(url='http://data.mxnet.io/mxnet/data/mnist.zip', - destfile='mnist.zip', method='wget') - unzip("mnist.zip") - file.remove("mnist.zip") - } - setwd("..") -} - -# multi-layer perceptron -get_mlp <- function() { - data <- mx.symbol.Variable('data') - fc1 <- mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128) - act1 <- mx.symbol.Activation(data = fc1, name='relu1', act_type="relu") - fc2 <- mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64) - act2 <- mx.symbol.Activation(data = fc2, name='relu2', act_type="relu") - fc3 <- mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10) - mlp <- mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax') - mlp -} - -# LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick -# Haffner. "Gradient-based learning applied to document recognition." 
-# Proceedings of the IEEE (1998) -get_lenet <- function() { - data <- mx.symbol.Variable('data') - # first conv - conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=20) - tanh1 <- mx.symbol.Activation(data=conv1, act_type="tanh") - pool1 <- mx.symbol.Pooling(data=tanh1, pool_type="max", - kernel=c(2,2), stride=c(2,2)) - # second conv - conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=50) - tanh2 <- mx.symbol.Activation(data=conv2, act_type="tanh") - pool2 <- mx.symbol.Pooling(data=tanh2, pool_type="max", - kernel=c(2,2), stride=c(2,2)) - # first fullc - flatten <- mx.symbol.Flatten(data=pool2) - fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=500) - tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh") - # second fullc - fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10) - # loss - lenet <- mx.symbol.SoftmaxOutput(data=fc2, name='softmax') - lenet -} - -get_iterator <- function(data_shape) { - get_iterator_impl <- function(args) { - data_dir = args$data_dir - if (!grepl('://', args$data_dir)) - download_(args$data_dir) - flat <- TRUE - if (length(data_shape) == 3) flat <- FALSE - - train = mx.io.MNISTIter( - image = paste0(data_dir, "train-images-idx3-ubyte"), - label = paste0(data_dir, "train-labels-idx1-ubyte"), - input_shape = data_shape, - batch_size = args$batch_size, - shuffle = TRUE, - flat = flat) - - val = mx.io.MNISTIter( - image = paste0(data_dir, "t10k-images-idx3-ubyte"), - label = paste0(data_dir, "t10k-labels-idx1-ubyte"), - input_shape = data_shape, - batch_size = args$batch_size, - flat = flat) - - ret = list(train=train, value=val) - } - get_iterator_impl -} - -parse_args <- function() { - parser <- ArgumentParser(description='train an image classifer on mnist') - parser$add_argument('--network', type='character', default='mlp', - choices = c('mlp', 'lenet'), - help = 'the cnn to use') - parser$add_argument('--data-dir', type='character', default='mnist/', - help='the input data directory') - parser$add_argument('--gpus', type='character', - help='the gpus will be used, e.g "0,1,2,3"') - parser$add_argument('--batch-size', type='integer', default=128, - help='the batch size') - parser$add_argument('--lr', type='double', default=.05, - help='the initial learning rate') - parser$add_argument('--mom', type='double', default=.9, - help='momentum for sgd') - parser$add_argument('--model-prefix', type='character', - help='the prefix of the model to load/save') - parser$add_argument('--num-round', type='integer', default=10, - help='the number of iterations over training data to train the model') - parser$add_argument('--kv-store', type='character', default='local', - help='the kvstore type') - - parser$parse_args() -} - -args = parse_args() -if (args$network == 'mlp') { - data_shape <- c(784) - net <- get_mlp() -} else { - data_shape <- c(28, 28, 1) - net <- get_lenet() -} - -# train -data_loader <- get_iterator(data_shape) -data <- data_loader(args) -train <- data$train -val <- data$value - -if (is.null(args$gpus)) { - devs <- mx.cpu() -} else { - devs <- lapply(unlist(strsplit(args$gpus, ",")), function(i) { - mx.gpu(as.integer(i)) - }) -} - -mx.set.seed(0) - -model <- mx.model.FeedForward.create( - X = train, - eval.data = val, - ctx = devs, - symbol = net, - num.round = args$num_round, - array.batch.size = args$batch_size, - learning.rate = args$lr, - momentum = args$mom, - eval.metric = mx.metric.accuracy, - initializer = mx.init.uniform(0.07), - batch.end.callback = mx.callback.log.train.metric(100)) diff --git 
a/example/image-classification/train_mnist.py b/example/image-classification/train_mnist.py deleted file mode 100644 index d47521fc8ef8..000000000000 --- a/example/image-classification/train_mnist.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Train mnist, see more explanation at https://mxnet.io/tutorials/python/mnist.html -""" -import os -import argparse -import logging -logging.basicConfig(level=logging.DEBUG) -from common import find_mxnet, fit -from common.util import download_file -import mxnet as mx -import numpy as np -import gzip, struct - -def read_data(label, image): - """ - download and read data into numpy - """ - base_url = 'http://yann.lecun.com/exdb/mnist/' - with gzip.open(download_file(base_url+label, os.path.join('data',label))) as flbl: - magic, num = struct.unpack(">II", flbl.read(8)) - label = np.fromstring(flbl.read(), dtype=np.int8) - with gzip.open(download_file(base_url+image, os.path.join('data',image)), 'rb') as fimg: - magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16)) - image = np.fromstring(fimg.read(), dtype=np.uint8).reshape(len(label), rows, cols) - return (label, image) - - -def to4d(img): - """ - reshape to 4D arrays - """ - return img.reshape(img.shape[0], 1, 28, 28).astype(np.float32)/255 - -def get_mnist_iter(args, kv): - """ - create data iterator with NDArrayIter - """ - (train_lbl, train_img) = read_data( - 'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz') - (val_lbl, val_img) = read_data( - 't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz') - train = mx.io.NDArrayIter( - to4d(train_img), train_lbl, args.batch_size, shuffle=True) - val = mx.io.NDArrayIter( - to4d(val_img), val_lbl, args.batch_size) - return (train, val) - -if __name__ == '__main__': - # parse args - parser = argparse.ArgumentParser(description="train mnist", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--num-classes', type=int, default=10, - help='the number of classes') - parser.add_argument('--num-examples', type=int, default=60000, - help='the number of training examples') - - parser.add_argument('--add_stn', action="store_true", default=False, help='Add Spatial Transformer Network Layer (lenet only)') - parser.add_argument('--image_shape', default='1, 28, 28', help='shape of training images') - - fit.add_fit_args(parser) - parser.set_defaults( - # network - network = 'mlp', - # train - gpus = None, - batch_size = 64, - disp_batches = 100, - num_epochs = 20, - lr = .05, - lr_step_epochs = '10' - ) - args = parser.parse_args() - - # load network - from importlib import import_module - net = import_module('symbols.'+args.network) - sym = net.get_symbol(**vars(args)) - - # train - fit.fit(args, sym, get_mnist_iter) diff --git 
a/example/image-classification/train_model.R b/example/image-classification/train_model.R deleted file mode 100644 index 83dba93c6853..000000000000 --- a/example/image-classification/train_model.R +++ /dev/null @@ -1,107 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -require(mxnet) - -train_model.fit <- function(args, network, data_loader) { - - # log - if(!is.null(args$log_file)){ - sink(file.path(args$log_dir, args$log_file), append = FALSE, - type=c("output", "message")) - cat(paste0("Starting computation of ", args$network, " at ", Sys.time(), "\n")) - } - cat("Arguments") - print(unlist(args)) - - # save model - if (is.null(args$model_prefix)) { - checkpoint <- NULL - } else { - checkpoint <- mx.callback.save.checkpoint(args$model_prefix) - } - - # load pretrained model - if(!is.null(args$load_epoch)){ - if(is.null(args$model_prefix)) stop("model_prefix should not be empty") - begin.round <- args$load_epoch - model <- mx.model.load(args$model_prefix, iteration=begin.round) - network <- model$symbol - arg.params <- model$arg.params - aux.params <- model$aux.params - } else{ - arg.params <- NULL - aux.params <- NULL - begin.round <- 1 - } - - # data - data <- data_loader(args) - train <- data$train - val <- data$value - - # devices - if (is.null(args$gpus)) { - devs <- mx.cpu() - } else { - devs <- lapply(unlist(strsplit(args$gpus, ",")), function(i) { - mx.gpu(as.integer(i)) - }) - } - - # learning rate scheduler - if (args$lr_factor < 1){ - epoch_size <- as.integer(max(args$num_examples/args$batch_size), 1) - if(!is.null(args$lr_multifactor)){ - step <- as.integer(strsplit(args$lr_multifactor,",")[[1]]) - step.updated <- step - begin.round + 1 - step.updated <- step.updated[step.updated > 0] - step_batch <- epoch_size*step.updated - lr_scheduler <- mx.lr_scheduler.MultiFactorScheduler(step=step_batch, factor_val=args$lr_factor) - } else{ - lr_scheduler <- mx.lr_scheduler.FactorScheduler( - step = as.integer(max(epoch_size * args$lr_factor_epoch, 1)), - factor_val = args$lr_factor) - } - } else{ - lr_scheduler = NULL - } - - # train - model <- mx.model.FeedForward.create( - X = train, - eval.data = val, - ctx = devs, - symbol = network, - begin.round = begin.round, - eval.metric = mx.metric.top_k_accuracy, - num.round = args$num_round, - learning.rate = args$lr, - momentum = args$mom, - wd = args$wd, - kvstore = args$kv_store, - array.batch.size = args$batch_size, - clip_gradient = args$clip_gradient, - lr_scheduler = lr_scheduler, - optimizer = "sgd", - initializer = mx.init.Xavier(factor_type="in", magnitude=2), - arg.params = arg.params, - aux.params = aux.params, - epoch.end.callback = checkpoint, - batch.end.callback = mx.callback.log.train.metric(50)) - -} diff --git a/example/neural_collaborative_filtering/README.md 
b/example/neural_collaborative_filtering/README.md deleted file mode 100644 index 4b16d20a8217..000000000000 --- a/example/neural_collaborative_filtering/README.md +++ /dev/null @@ -1,113 +0,0 @@ - - - - - - - - - - - - - - - - - -# Neural Collaborative Filtering - -[![Build Status](https://travis-ci.com/xinyu-intel/ncf_mxnet.svg?branch=master)](https://travis-ci.com/xinyu-intel/ncf_mxnet) - -This is MXNet implementation for the paper: - -Xiangnan He, Lizi Liao, Hanwang Zhang, Liqiang Nie, Xia Hu and Tat-Seng Chua (2017). [Neural Collaborative Filtering.](http://dl.acm.org/citation.cfm?id=3052569) In Proceedings of WWW '17, Perth, Australia, April 03-07, 2017. - -Three collaborative filtering models: Generalized Matrix Factorization (GMF), Multi-Layer Perceptron (MLP), and Neural Matrix Factorization (NeuMF). To target the models for implicit feedback and ranking task, we optimize them using log loss with negative sampling. - -Author: Dr. Xiangnan He (http://www.comp.nus.edu.sg/~xiangnan/) - -Code Reference: https://github.com/hexiangnan/neural_collaborative_filtering - -## Dataset - -We provide the processed datasets on [Google Drive](https://drive.google.com/drive/folders/1qACR_Zhc2O2W0RrazzcepM2vJeh0MMdO?usp=sharing): MovieLens 20 Million (ml-20m), you can download directly or -run the script to prepare the datasets: -``` -python convert.py -``` - -train-ratings.csv -- Train file (positive instances). -- Each Line is a training instance: userID\t itemID\t - -test-ratings.csv -- Test file (positive instances). -- Each Line is a testing instance: userID\t itemID\t - -test-negative.csv -- Test file (negative instances). -- Each line corresponds to the line of test.rating, containing 999 negative samples. -- Each line is in the format: userID,\t negativeItemID1\t negativeItemID2 ... - -## Pre-trained models - -We provide the pretrained ml-20m model on [Google Drive](https://drive.google.com/drive/folders/1qACR_Zhc2O2W0RrazzcepM2vJeh0MMdO?usp=sharing), you can download directly for evaluation or calibration. 
- -|dtype|HR@10|NDCG@10| -|:---:|:--:|:--:| -|float32|0.6393|0.3849| -|float32 opt|0.6393|0.3849| -|int8|0.6395|0.3852| -|int8 opt|0.6396|0.3852| - -## Training - -``` -# train ncf model with ml-20m dataset -python train.py # --gpu=0 -``` - -## Model Optimizer - -``` -# optimize model -python model_optimizer.py -``` - -## Calibration - -``` -# neumf calibration on ml-20m dataset -python ncf.py --prefix=./model/ml-20m/neumf --calibration -# optimized neumf calibration on ml-20m dataset -python ncf.py --prefix=./model/ml-20m/neumf-opt --calibration -``` - -## Evaluation - -``` -# neumf float32 inference on ml-20m dataset -python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf -# optimized neumf float32 inference on ml-20m dataset -python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf-opt -# neumf int8 inference on ml-20m dataset -python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf-quantized -# optimized neumf int8 inference on ml-20m dataset -python ncf.py --batch-size=1000 --prefix=./model/ml-20m/neumf-opt-quantized -``` - -## Benchmark - -``` -usage: bash ./benchmark.sh [[[-p prefix ] [-e epoch] [-d dataset] [-b batch_size] [-i instance] [-c cores/instance]] | [-h]] - -# neumf float32 benchmark on ml-20m dataset -sh benchmark.sh -p model/ml-20m/neumf -# optimized neumf float32 benchmark on ml-20m dataset -sh benchmark.sh -p model/ml-20m/neumf-opt -# neumf int8 benchmark on ml-20m dataset -sh benchmark.sh -p model/ml-20m/neumf-quantized -# optimized neumf int8 benchmark on ml-20m dataset -sh benchmark.sh -p model/ml-20m/neumf-opt-quantized -``` diff --git a/example/neural_collaborative_filtering/benchmark.sh b/example/neural_collaborative_filtering/benchmark.sh deleted file mode 100755 index 703a087d9a50..000000000000 --- a/example/neural_collaborative_filtering/benchmark.sh +++ /dev/null @@ -1,118 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -usage() -{ - echo "usage: bash ./benchmark.sh [[[-p prefix ] [-e epoch] [-d dataset] [-b batch_size] [-i instance] [-c cores/instance]] | [-h]]" -} - -while [ $# -gt 0 ]; do - case "$1" in - --prefix | -p) - shift - PREFIX=$1 - ;; - --epoch | -e) - shift - EPOCH=$1 - ;; - --dataset | -d) - shift - DATASET=$1 - ;; - --batch-size | -b) - shift - BS=$1 - ;; - --instance | -i) - shift - INS=$1 - ;; - --core | -c) - shift - CORES=$1 - ;; - --help | -h) - usage - exit 1 - ;; - *) - usage - exit 1 - esac - shift -done - -NUM_SOCKET=`lscpu | grep 'Socket(s)' | awk '{print $NF}'` -NUM_NUMA_NODE=`lscpu | grep 'NUMA node(s)' | awk '{print $NF}'` -CORES_PER_SOCKET=`lscpu | grep 'Core(s) per socket' | awk '{print $NF}'` -NUM_CORES=$((CORES_PER_SOCKET * NUM_SOCKET)) -CORES_PER_NUMA=$((NUM_CORES / NUM_NUMA_NODE)) -echo "target machine has $NUM_CORES physical core(s) on $NUM_NUMA_NODE numa nodes of $NUM_SOCKET socket(s)." - -if [ -z $PREFIX ]; then - echo "Error: Need a model prefix." - exit -fi -if [ -z $EPOCH ]; then - echo "Default: set epoch of model parameters to 7." - EPOCH=7 -fi -if [ -z $DATASET ]; then - echo "Default: set dataset to ml-20m." - DATASET='ml-20m' -fi -if [ -z $INS ]; then - echo "Default: launch one instance per physical core." - INS=$NUM_CORES -fi -if [ -z $CORES ]; then - echo "Default: divide full physical cores." - CORES=$((NUM_CORES / $INS)) -fi -if [ -z $BS ]; then - echo "Default: set batch size to 700." - BS=700 -fi - -echo " cores/instance: $CORES" -echo " total instances: $INS" -echo " batch size: $BS" -echo "" - -rm NCF_*.log - -for((i=0;i<$INS;i++)); -do - ((a=$i*$CORES)) - ((b=$a+$CORES-1)) - memid=$((b/CORES_PER_NUMA % NUM_NUMA_NODE)) - LOG=NCF_$i.log - echo " Instance $i use $a-$b cores with $LOG" - KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 \ - OMP_NUM_THREADS=$CORES \ - numactl --physcpubind=$a-$b --membind=$memid python ncf.py --batch-size=$BS --dataset=$DATASET --epoch=$EPOCH --benchmark --prefix=$PREFIX 2>&1 | tee $LOG & -done -wait - -sps=`grep speed NCF_*.log | awk '{ sum += $(NF-1) }; END { print sum }'` -latency=$(awk "BEGIN {printf \"%.2f\", 1000*${BS}*${INS}/${sps}}") -echo "overall throughput (samples/sec): $sps" -echo "latency per batch per instance (ms): $latency" -echo "benchmark finish:)" diff --git a/example/neural_collaborative_filtering/ci.py b/example/neural_collaborative_filtering/ci.py deleted file mode 100644 index 1bf5b27cae32..000000000000 --- a/example/neural_collaborative_filtering/ci.py +++ /dev/null @@ -1,60 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import mxnet as mx -from core.model import get_model - -def test_model(): - def test_ncf(model_type): - net = get_model(model_type=model_type, factor_size_mlp=128, factor_size_gmf=64, - model_layers=[256, 128, 64], num_hidden=1, max_user=138493, max_item=26744) - mod = mx.module.Module(net, context=mx.cpu(), data_names=['user', 'item'], label_names=['softmax_label']) - provide_data = [mx.io.DataDesc(name='item', shape=((1,))), - mx.io.DataDesc(name='user', shape=((1,)))] - provide_label = [mx.io.DataDesc(name='softmax_label', shape=((1,)))] - mod.bind(for_training=True, data_shapes=provide_data, label_shapes=provide_label) - mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - data = [mx.nd.full(shape=shape, val=26744, ctx=mx.cpu(), dtype='int32') - for _, shape in mod.data_shapes] - batch = mx.io.DataBatch(data, []) - mod.forward(batch) - mod.backward() - mx.nd.waitall() - - data_dict = {'user': data[0], 'item': data[1]} - calib_data = mx.io.NDArrayIter(data=data_dict, batch_size=1) - calib_data = mx.test_utils.DummyIter(calib_data) - arg_params, aux_params = mod.get_params() - qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model_mkldnn(sym=net, - arg_params=arg_params, - aux_params=aux_params, - ctx=mx.cpu(), - quantized_dtype='auto', - calib_mode='naive', - calib_data=calib_data, - data_names=['user', 'item'], - excluded_sym_names=['post_gemm_concat', 'fc_final'], - num_calib_examples=1) - qmod = mx.module.Module(qsym, context=mx.cpu(), data_names=['user', 'item'], label_names=['softmax_label']) - qmod.bind(for_training=True, data_shapes=provide_data, label_shapes=provide_label) - qmod.set_params(qarg_params, qaux_params) - qmod.forward(batch) - mx.nd.waitall() - - for model_type in ['neumf', 'mlp', 'gmf']: - test_ncf(model_type) - diff --git a/example/neural_collaborative_filtering/convert.py b/example/neural_collaborative_filtering/convert.py deleted file mode 100644 index 7fb7f1ede9e4..000000000000 --- a/example/neural_collaborative_filtering/convert.py +++ /dev/null @@ -1,127 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -import os -import urllib -import zipfile -from argparse import ArgumentParser -from collections import defaultdict -import numpy as np -import pandas as pd -from tqdm import tqdm -from core.load import implicit_load - -MIN_RATINGS = 20 - -USER_COLUMN = 'user_id' -ITEM_COLUMN = 'item_id' - -TRAIN_RATINGS_FILENAME = 'train-ratings.csv' -TEST_RATINGS_FILENAME = 'test-ratings.csv' -TEST_NEG_FILENAME = 'test-negative.csv' - -def parse_args(): - parser = ArgumentParser() - parser.add_argument('--dataset', nargs='?', default='ml-20m', choices=['ml-1m', 'ml-20m'], - help='The dataset name, temporary support ml-1m and ml-20m.') - parser.add_argument('--path', type=str, default = './data/', - help='Path to reviews CSV file from MovieLens') - parser.add_argument('-n', '--negatives', type=int, default=999, - help='Number of negative samples for each positive' - 'test example') - parser.add_argument('-s', '--seed', type=int, default=0, - help='Random seed to reproduce same negative samples') - return parser.parse_args() - -def get_movielens_data(data_dir, dataset): - if not os.path.exists(data_dir + '%s.zip' % dataset): - os.mkdir(data_dir) - urllib.request.urlretrieve('http://files.grouplens.org/datasets/movielens/%s.zip' % dataset, data_dir + dataset + '.zip') - with zipfile.ZipFile(data_dir + "%s.zip" % dataset, "r") as f: - f.extractall(data_dir + "./") - -def main(): - args = parse_args() - np.random.seed(args.seed) - - print("download movielens {} dataset".format(args.dataset)) - get_movielens_data(args.path, args.dataset) - output = os.path.join(args.path, args.dataset) - - print("Loading raw data from {}".format(output)) - df = implicit_load(os.path.join(output,"ratings.csv"), sort=False) - - print("Filtering out users with less than {} ratings".format(MIN_RATINGS)) - grouped = df.groupby(USER_COLUMN) - df = grouped.filter(lambda x: len(x) >= MIN_RATINGS) - - print("Mapping original user and item IDs to new sequential IDs") - original_users = df[USER_COLUMN].unique() - original_items = df[ITEM_COLUMN].unique() - - user_map = {user: index for index, user in enumerate(original_users)} - item_map = {item: index for index, item in enumerate(original_items)} - - df[USER_COLUMN] = df[USER_COLUMN].apply(lambda user: user_map[user]) - df[ITEM_COLUMN] = df[ITEM_COLUMN].apply(lambda item: item_map[item]) - - assert df[USER_COLUMN].max() == len(original_users) - 1 - assert df[ITEM_COLUMN].max() == len(original_items) - 1 - - print("Creating list of items for each user") - # Need to sort before popping to get last item - df.sort_values(by='timestamp', inplace=True) - all_ratings = set(zip(df[USER_COLUMN], df[ITEM_COLUMN])) - user_to_items = defaultdict(list) - for row in tqdm(df.itertuples(), desc='Ratings', total=len(df)): - user_to_items[getattr(row, USER_COLUMN)].append(getattr(row, ITEM_COLUMN)) # noqa: E501 - - test_ratings = [] - test_negs = [] - all_items = set(range(len(original_items))) - - print("Generating {} negative samples for each user" - .format(args.negatives)) - - for user in tqdm(range(len(original_users)), desc='Users', total=len(original_users)): # noqa: E501 - test_item = user_to_items[user].pop() - - all_ratings.remove((user, test_item)) - all_negs = all_items - set(user_to_items[user]) - all_negs = sorted(list(all_negs)) # determinism - - test_ratings.append((user, test_item)) - test_negs.append(list(np.random.choice(all_negs, args.negatives))) - - print("Saving train and test CSV files to {}".format(output)) - df_train_ratings = pd.DataFrame(list(all_ratings)) - 
df_train_ratings['fake_rating'] = 1 - df_train_ratings.to_csv(os.path.join(output, TRAIN_RATINGS_FILENAME), - index=False, header=False, sep='\t') - - df_test_ratings = pd.DataFrame(test_ratings) - df_test_ratings['fake_rating'] = 1 - df_test_ratings.to_csv(os.path.join(output, TEST_RATINGS_FILENAME), - index=False, header=False, sep='\t') - - df_test_negs = pd.DataFrame(test_negs) - df_test_negs.to_csv(os.path.join(output, TEST_NEG_FILENAME), - index=False, header=False, sep='\t') - -if __name__ == '__main__': - main() - diff --git a/example/neural_collaborative_filtering/core/dataset.py b/example/neural_collaborative_filtering/core/dataset.py deleted file mode 100644 index 56f04fd53811..000000000000 --- a/example/neural_collaborative_filtering/core/dataset.py +++ /dev/null @@ -1,99 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -import os -import mxnet as mx -import numpy as np -import pandas as pd -import scipy.sparse as sp - -class NCFTestData(object): - def __init__(self, path): - ''' - Constructor - path: converted data root - testRatings: converted test ratings data - testNegatives: negative samples for evaluation dataset - ''' - self.testRatings = self.load_rating_file_as_list(os.path.join(path, 'test-ratings.csv')) - self.testNegatives = self.load_negative_file(os.path.join(path ,'test-negative.csv')) - assert len(self.testRatings) == len(self.testNegatives) - - def load_rating_file_as_list(self, filename): - ratingList = [] - with open(filename, "r") as f: - line = f.readline() - while line != None and line != "": - arr = line.split("\t") - user, item = int(arr[0]), int(arr[1]) - ratingList.append([user, item]) - line = f.readline() - return ratingList - - def load_negative_file(self, filename): - negativeList = [] - with open(filename, "r") as f: - line = f.readline() - while line != None and line != "": - arr = line.split("\t") - negatives = [] - for x in arr: - negatives.append(int(x)) - negativeList.append(negatives) - line = f.readline() - return negativeList - -class NCFTrainData(mx.gluon.data.Dataset): - def __init__(self, train_fname, nb_neg): - ''' - Constructor - train_fname: converted data root - nb_neg: number of negative samples per positive sample while training - ''' - self._load_train_matrix(train_fname) - self.nb_neg = nb_neg - - def _load_train_matrix(self, train_fname): - def process_line(line): - tmp = line.split('\t') - return [int(tmp[0]), int(tmp[1]), float(tmp[2]) > 0] - with open(train_fname, 'r') as file: - data = list(map(process_line, file)) - self.nb_users = max(data, key=lambda x: x[0])[0] + 1 - self.nb_items = max(data, key=lambda x: x[1])[1] + 1 - - self.data = list(filter(lambda x: x[2], data)) - self.mat = sp.dok_matrix( - (self.nb_users, self.nb_items), dtype=np.float32) - for user, item, _ in data: - 
self.mat[user, item] = 1. - - def __len__(self): - return (self.nb_neg + 1) * len(self.data) - - def __getitem__(self, idx): - if idx % (self.nb_neg + 1) == 0: - idx = idx // (self.nb_neg + 1) - return self.data[idx][0], self.data[idx][1], np.ones(1, dtype=np.float32).item() # noqa: E501 - else: - idx = idx // (self.nb_neg + 1) - u = self.data[idx][0] - j = mx.random.randint(0, self.nb_items).asnumpy().item() - while (u, j) in self.mat: - j = mx.random.randint(0, self.nb_items).asnumpy().item() - return u, j, np.zeros(1, dtype=np.float32).item() - diff --git a/example/neural_collaborative_filtering/core/evaluate.py b/example/neural_collaborative_filtering/core/evaluate.py deleted file mode 100644 index 02a1379f6454..000000000000 --- a/example/neural_collaborative_filtering/core/evaluate.py +++ /dev/null @@ -1,105 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -import math -import heapq -import random -import logging -import mxnet as mx -import numpy as np - -def get_movielens_iter(filename, batch_size, ctx, logger): - """Not particularly fast code to parse the text file and load into NDArrays. - return two data iters, one for train, the other for validation. - """ - logger.info("Preparing data iterators for " + filename + " ... ") - user = [] - item = [] - score = [] - with open(filename, 'r') as f: - num_samples = 0 - for line in f: - tks = line.strip().split('\t') - if len(tks) != 3: - continue - num_samples += 1 - user.append((tks[0])) - item.append((tks[1])) - score.append((tks[2])) - # convert to ndarrays - user = mx.nd.array(user, dtype='int32').as_in_context(ctx) - item = mx.nd.array(item, dtype='int32').as_in_context(ctx) - score = mx.nd.array(score).as_in_context(ctx) - # prepare data iters - data = {'user': user, 'item': item} - label = {'softmax_label': score} - iter = mx.io.NDArrayIter(data=data, label=label, batch_size=batch_size) - return iter - - -def predict(model, users, items, batch_size=1000, ctx=mx.cpu()): - user = mx.nd.array(users, dtype='int32').as_in_context(ctx) - item = mx.nd.array(items, dtype='int32').as_in_context(ctx) - label = mx.nd.zeros(len(user)).as_in_context(ctx) - data = {'user': user, 'item': item} - label = {'softmax_label':label} - eval_iter = mx.io.NDArrayIter(data=data, label=label, batch_size=batch_size) - preds = [] - for batch in eval_iter: - model.forward(batch) - outp = model.get_outputs()[0].asnumpy() - preds += list(outp.flatten()) - return preds - -def _calculate_hit(ranked, test_item): - return int(test_item in ranked) - - -def _calculate_ndcg(ranked, test_item): - for i, item in enumerate(ranked): - if item == test_item: - return math.log(2) / math.log(i + 2) - return 0. 
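# Editor's illustration (not part of the original evaluate.py): a minimal,
# self-contained sketch of how the HR@K and NDCG@K helpers above behave on a
# toy ranking; `ranked` and `test_item` values here are hypothetical.
import math

def toy_hit_and_ndcg(ranked, test_item):
    hit = int(test_item in ranked)                  # HR@K: is the held-out item in the top-K list?
    ndcg = 0.
    for pos, item in enumerate(ranked):
        if item == test_item:
            ndcg = math.log(2) / math.log(pos + 2)  # single-relevant-item NDCG at 0-based rank `pos`
            break
    return hit, ndcg

# Held-out item 42 ranked third in the top-10 list: HR@10 = 1,
# NDCG@10 = log(2)/log(4) = 0.5
print(toy_hit_and_ndcg([7, 13, 42, 5, 99, 1, 2, 3, 4, 8], 42))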
- -def eval_one(rating, items, model, K, batch_size, ctx): - user = rating[0] - test_item = rating[1] - items.append(test_item) - users = [user] * len(items) - predictions = predict(model, users, items, batch_size, ctx) - - map_item_score = {item: pred for item, pred in zip(items, predictions)} - ranked = heapq.nlargest(K, map_item_score, key=map_item_score.get) - - hit = _calculate_hit(ranked, test_item) - ndcg = _calculate_ndcg(ranked, test_item) - return hit, ndcg, len(predictions) - -def evaluate_model(model, ratings, negs, K, batch_size, ctx, logger=None): - hits, ndcgs, num_preds = [], [], [] - index = 0 - for rating, items in zip(ratings, negs): - index += 1 - hit, ndcg, num_pred = eval_one(rating, items, model, K, batch_size, ctx) - hits.append(hit) - ndcgs.append(ndcg) - num_preds.append(num_pred) - if index % batch_size == 0: - logger.info('evaluating test data {} / {}'.format(index, len(ratings))) - - return hits, ndcgs - diff --git a/example/neural_collaborative_filtering/core/load.py b/example/neural_collaborative_filtering/core/load.py deleted file mode 100644 index de47af9fb36b..000000000000 --- a/example/neural_collaborative_filtering/core/load.py +++ /dev/null @@ -1,74 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -from collections import namedtuple - -import pandas as pd - - -RatingData = namedtuple('RatingData', - ['items', 'users', 'ratings', 'min_date', 'max_date']) - - -def describe_ratings(ratings): - info = RatingData(items=len(ratings['item_id'].unique()), - users=len(ratings['user_id'].unique()), - ratings=len(ratings), - min_date=ratings['timestamp'].min(), - max_date=ratings['timestamp'].max()) - print("{ratings} ratings on {items} items from {users} users" - " from {min_date} to {max_date}" - .format(**(info._asdict()))) - return info - - -def process_movielens(ratings, sort=True): - ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s') - if sort: - ratings.sort_values(by='timestamp', inplace=True) - describe_ratings(ratings) - return ratings - - -def load_ml_1m(filename, sort=True): - names = ['user_id', 'item_id', 'rating', 'timestamp'] - ratings = pd.read_csv(filename, sep='::', names=names, engine='python') - return process_movielens(ratings, sort=sort) - - -def load_ml_20m(filename, sort=True): - ratings = pd.read_csv(filename) - ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s') - names = {'userId': 'user_id', 'movieId': 'item_id'} - ratings.rename(columns=names, inplace=True) - return process_movielens(ratings, sort=sort) - - -DATASETS = [k.replace('load_', '') for k in locals().keys() if "load_" in k] - - -def get_dataset_name(filename): - for dataset in DATASETS: - if dataset in filename.replace('-', '_').lower(): - return dataset - raise NotImplementedError - - -def implicit_load(filename, sort=True): - func = globals()["load_" + get_dataset_name(filename)] - return func(filename, sort=sort) - diff --git a/example/neural_collaborative_filtering/core/model.py b/example/neural_collaborative_filtering/core/model.py deleted file mode 100644 index 6c03bb01a357..000000000000 --- a/example/neural_collaborative_filtering/core/model.py +++ /dev/null @@ -1,135 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -import mxnet as mx -import numpy as np - -@mx.init.register -class golorot_uniform(mx.init.Initializer): - def __init__(self, fan_in, fan_out): - super(golorot_uniform, self).__init__(fan_in=fan_in, fan_out=fan_out) - self._fan_in = fan_in - self._fan_out = fan_out - def _init_weight(self, _, arr): - limit = np.sqrt(6. / (self._fan_in + self._fan_out)) - mx.random.uniform(-limit, limit, out=arr) - -@mx.init.register -class lecunn_uniform(mx.init.Initializer): - def __init__(self, fan_in): - super(lecunn_uniform, self).__init__(fan_in=fan_in) - self._fan_in = fan_in - def _init_weight(self, _, arr): - limit = np.sqrt(3. 
/ self._fan_in) - mx.random.uniform(-limit, limit, out=arr) - -# only for inference model optimize -def mlp_opt(user, item, factor_size, model_layers, max_user, max_item): - user_weight = mx.sym.Variable('fused_mlp_user_weight', init=mx.init.Normal(0.01)) - item_weight = mx.sym.Variable('fused_mlp_item_weight', init=mx.init.Normal(0.01)) - embed_user = mx.sym.Embedding(data=user, weight=user_weight, input_dim=max_user, - output_dim=factor_size * 2, name='fused_embed_user'+str(factor_size)) - embed_item = mx.sym.Embedding(data=item, weight=item_weight, input_dim=max_item, - output_dim=factor_size * 2, name='fused_embed_item'+str(factor_size)) - pre_gemm_concat = embed_user + embed_item - - for i in range(1, len(model_layers)): - if i==1: - pre_gemm_concat = mx.sym.Activation(data=pre_gemm_concat, act_type='relu', name='act_'+str(i-1)) - continue - else: - mlp_weight_init = golorot_uniform(model_layers[i-1], model_layers[i]) - mlp_weight = mx.sym.Variable('fc_{}_weight'.format(i-1), init=mlp_weight_init) - pre_gemm_concat = mx.sym.FullyConnected(data=pre_gemm_concat, weight=mlp_weight, num_hidden=model_layers[i], name='fc_'+str(i-1)) - pre_gemm_concat = mx.sym.Activation(data=pre_gemm_concat, act_type='relu', name='act_'+str(i-1)) - - return pre_gemm_concat - -def mlp(user, item, factor_size, model_layers, max_user, max_item): - user_weight = mx.sym.Variable('mlp_user_weight', init=mx.init.Normal(0.01)) - item_weight = mx.sym.Variable('mlp_item_weight', init=mx.init.Normal(0.01)) - embed_user = mx.sym.Embedding(data=user, weight=user_weight, input_dim=max_user, - output_dim=factor_size, name='embed_user'+str(factor_size)) - embed_item = mx.sym.Embedding(data=item, weight=item_weight, input_dim=max_item, - output_dim=factor_size, name='embed_item'+str(factor_size)) - pre_gemm_concat = mx.sym.concat(embed_user, embed_item, dim=1, name='pre_gemm_concat') - - for i in range(1, len(model_layers)): - mlp_weight_init = golorot_uniform(model_layers[i-1], model_layers[i]) - mlp_weight = mx.sym.Variable('fc_{}_weight'.format(i-1), init=mlp_weight_init) - pre_gemm_concat = mx.sym.FullyConnected(data=pre_gemm_concat, weight=mlp_weight, num_hidden=model_layers[i], name='fc_'+str(i-1)) - pre_gemm_concat = mx.sym.Activation(data=pre_gemm_concat, act_type='relu', name='act_'+str(i-1)) - - return pre_gemm_concat - -def gmf(user, item, factor_size, max_user, max_item): - user_weight = mx.sym.Variable('gmf_user_weight', init=mx.init.Normal(0.01)) - item_weight = mx.sym.Variable('gmf_item_weight', init=mx.init.Normal(0.01)) - embed_user = mx.sym.Embedding(data=user, weight=user_weight, input_dim=max_user, - output_dim=factor_size, name='embed_user'+str(factor_size)) - embed_item = mx.sym.Embedding(data=item, weight=item_weight, input_dim=max_item, - output_dim=factor_size, name='embed_item'+str(factor_size)) - pred = embed_user * embed_item - - return pred - -def get_model(model_type='neumf', factor_size_mlp=128, factor_size_gmf=64, - model_layers=[256, 256, 128, 64], num_hidden=1, - max_user=138493, max_item=26744, opt=False): - # input - user = mx.sym.Variable('user') - item = mx.sym.Variable('item') - - if model_type == 'mlp': - if opt: - net = mlp_opt(user=user, item=item, - factor_size=factor_size_mlp, model_layers=model_layers, - max_user=max_user, max_item=max_item) - else: - net = mlp(user=user, item=item, - factor_size=factor_size_mlp, model_layers=model_layers, - max_user=max_user, max_item=max_item) - elif model_type == 'gmf': - net = gmf(user=user, item=item, - factor_size=factor_size_gmf, - 
max_user=max_user, max_item=max_item) - elif model_type == 'neumf': - if opt: - net_mlp = mlp_opt(user=user, item=item, - factor_size=factor_size_mlp, model_layers=model_layers, - max_user=max_user, max_item=max_item) - else: - net_mlp = mlp(user=user, item=item, - factor_size=factor_size_mlp, model_layers=model_layers, - max_user=max_user, max_item=max_item) - net_gmf = gmf(user=user, item=item, - factor_size=factor_size_gmf, - max_user=max_user, max_item=max_item) - - net = mx.sym.concat(net_gmf, net_mlp, dim=1, name='post_gemm_concat') - - else: - raise ValueError('Unsupported ncf model %s.' % model_type) - - final_weight = mx.sym.Variable('fc_final_weight', init=lecunn_uniform(factor_size_gmf + model_layers[-1])) - net = mx.sym.FullyConnected(data=net, weight=final_weight, num_hidden=num_hidden, name='fc_final') - - y_label = mx.sym.Variable('softmax_label') - net = mx.symbol.LogisticRegressionOutput(data=net, label=y_label, name='sigmoid_final') - - return net - diff --git a/example/neural_collaborative_filtering/model_optimizer.py b/example/neural_collaborative_filtering/model_optimizer.py deleted file mode 100644 index 2866ae7e7e05..000000000000 --- a/example/neural_collaborative_filtering/model_optimizer.py +++ /dev/null @@ -1,81 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
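# Editor's illustration (not part of the original model_optimizer.py): the
# script below folds the first MLP FullyConnected layer into the user/item
# embedding tables. For a concatenated input [u; v] and a weight matrix W
# split column-wise into [W_u | W_v]:
#     W @ concat(u, v) + b  ==  (W_u @ u + b) + (W_v @ v)
# so W_u and b can be pre-applied to every row of the user embedding table and
# W_v to the item table, which is what the FullyConnected calls on the raw
# embedding weights below compute. A quick numpy check of the identity:
import numpy as np

f, hidden = 4, 3                                   # toy embedding / layer sizes
u, v = np.random.rand(f), np.random.rand(f)        # one user and one item embedding row
W, b = np.random.rand(hidden, 2 * f), np.random.rand(hidden)
lhs = W @ np.concatenate([u, v]) + b
rhs = (W[:, :f] @ u + b) + (W[:, f:] @ v)
assert np.allclose(lhs, rhs)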
-# -import os -import time -import argparse -import logging -import math -import random -import numpy as np -import mxnet as mx -from core.model import get_model -from core.dataset import NCFTrainData - -logging.basicConfig(level=logging.DEBUG) - -parser = argparse.ArgumentParser(description="Run model optimizer.", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--path', nargs='?', default='./data/', - help='Input data path.') -parser.add_argument('--dataset', nargs='?', default='ml-20m', - help='The dataset name.') -parser.add_argument('--model-prefix', type=str, default='./model/ml-20m/neumf') -parser.add_argument('--epoch', type=int, default=7, help='parameters epoch') -parser.add_argument('--model-type', type=str, default='neumf', choices=['neumf', 'gmf', 'mlp'], - help="mdoel type") -parser.add_argument('--layers', default='[256, 256, 128, 64]', - help="list of number hiddens of fc layers in mlp model.") -parser.add_argument('--factor-size-gmf', type=int, default=64, - help="outdim of gmf embedding layers.") -parser.add_argument('--num-hidden', type=int, default=1, - help="num-hidden of neumf fc layer") - -head = '%(asctime)-15s %(message)s' -logging.basicConfig(level=logging.INFO, format=head) - -# arg parser -args = parser.parse_args() -logging.info(args) - -model_prefix = args.model_prefix -model_type = args.model_type -model_layers = eval(args.layers) -factor_size_gmf = args.factor_size_gmf -factor_size_mlp = int(model_layers[0]/2) -num_hidden = args.num_hidden -train_dataset = NCFTrainData((args.path + args.dataset + '/train-ratings.csv'), nb_neg=4) -net = get_model(model_type, factor_size_mlp, factor_size_gmf, - model_layers, num_hidden, train_dataset.nb_users, train_dataset.nb_items, opt=True) - -raw_params, _ = mx.model.load_params(model_prefix, args.epoch) -fc_0_weight_split = mx.nd.split(raw_params['fc_0_weight'], axis=1, num_outputs=2) -fc_0_left = fc_0_weight_split[0] -fc_0_right = fc_0_weight_split[1] - -user_weight_fusion = mx.nd.FullyConnected(data = raw_params['mlp_user_weight'], weight=fc_0_left, bias=raw_params['fc_0_bias'], no_bias=False, num_hidden=model_layers[0]) -item_weight_fusion = mx.nd.FullyConnected(data = raw_params['mlp_item_weight'], weight=fc_0_right, no_bias=True, num_hidden=model_layers[0]) - -opt_params = raw_params -del opt_params['mlp_user_weight'] -del opt_params['mlp_item_weight'] -del opt_params['fc_0_bias'] -opt_params['fused_mlp_user_weight'] = user_weight_fusion -opt_params['fused_mlp_item_weight'] = item_weight_fusion - -mx.model.save_checkpoint(model_prefix + '-opt', args.epoch, net, opt_params, {}) - diff --git a/example/neural_collaborative_filtering/ncf.py b/example/neural_collaborative_filtering/ncf.py deleted file mode 100644 index b01be01bc8d9..000000000000 --- a/example/neural_collaborative_filtering/ncf.py +++ /dev/null @@ -1,162 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -import os -import time -import argparse -import logging -import math -import random -import numpy as np -import mxnet as mx -from core.model import get_model -from core.dataset import NCFTestData -from core.evaluate import * -from mxnet.contrib.quantization import * - -logging.basicConfig(level=logging.DEBUG) - -parser = argparse.ArgumentParser(description="Run matrix factorization with embedding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--path', nargs='?', default='./data/', - help='Input data path.') -parser.add_argument('--dataset', nargs='?', default='ml-20m', - help='The dataset name.') -parser.add_argument('--max-user', type=int, default=138493, - help='max number of user index.') -parser.add_argument('--max-item', type=int, default=26744, - help='max number of item index.') -parser.add_argument('--batch-size', type=int, default=256, - help='number of examples per batch') -parser.add_argument('--topk', type=int, default=10, - help="topk for accuracy evaluation.") -parser.add_argument('--gpu', type=int, default=None, - help="index of gpu to run, e.g. 0 or 1. None means using cpu().") -parser.add_argument('--benchmark', action='store_true', help="whether to benchmark performance only") -parser.add_argument('--epoch', type=int, default=7, help='model checkpoint index for inference') -parser.add_argument('--prefix', default='./model/ml-20m/neumf', help="model checkpoint prefix") -parser.add_argument('--calibration', action='store_true', help="whether to calibrate model") -parser.add_argument('--calib-mode', type=str, choices=['naive', 'entropy'], default='naive', - help='calibration mode used for generating calibration table for the quantized symbol; supports' - ' 1. naive: simply take min and max values of layer outputs as thresholds for' - ' quantization. In general, the inference accuracy worsens with more examples used in' - ' calibration. It is recommended to use `entropy` mode as it produces more accurate' - ' inference results.' - ' 2. entropy: calculate KL divergence of the fp32 output and quantized output for optimal' - ' thresholds. 
This mode is expected to produce the best inference accuracy of all three' - ' kinds of quantized models if the calibration dataset is representative enough of the' - ' inference dataset.') -parser.add_argument('--quantized-dtype', type=str, default='auto', - choices=['auto', 'int8', 'uint8'], - help='quantization destination data type for input data') -parser.add_argument('--num-calib-batches', type=int, default=10, - help='number of batches for calibration') - -if __name__ == '__main__': - head = '%(asctime)-15s %(message)s' - logging.basicConfig(level=logging.INFO, format=head) - - # arg parser - args = parser.parse_args() - logging.info(args) - - max_user = args.max_user - max_item = args.max_item - batch_size = args.batch_size - benchmark = args.benchmark - calibration = args.calibration - calib_mode = args.calib_mode - quantized_dtype = args.quantized_dtype - num_calib_batches = args.num_calib_batches - ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu) - topK = args.topk - - # prepare dataset - if benchmark or calibration: - logging.info('Prepare movielens dataset') - val_iter = get_movielens_iter(args.path + args.dataset + '/test-ratings.csv', batch_size, ctx=ctx, logger=logging) - else: - logging.info('Prepare validation dataset') - data = NCFTestData(args.path + args.dataset) - testRatings, testNegatives= data.testRatings, data.testNegatives - logging.info("Load validation data done. #user=%d, #item=%d, #test=%d" - %(max_user, max_item, len(testRatings))) - logging.info('Prepare validation dataset completed') - - # construct the model - net, arg_params, aux_params = mx.model.load_checkpoint(args.prefix, args.epoch) - if ctx == mx.cpu() and calibration: - net = net.get_backend_symbol('MKLDNN_QUANTIZE') - - # initialize the module - mod = mx.module.Module(net, context=ctx, data_names=['user', 'item'], label_names=['softmax_label']) - provide_data = [mx.io.DataDesc(name='item', shape=((batch_size,))), - mx.io.DataDesc(name='user', shape=((batch_size,)))] - provide_label = [mx.io.DataDesc(name='softmax_label', shape=((batch_size,)))] - mod.bind(for_training=False, data_shapes=provide_data, label_shapes=provide_label) - mod.set_params(arg_params, aux_params) - - if calibration: - logging.info('Quantizing FP32 model') - excluded_sym_names = ['post_gemm_concat', 'fc_final'] - cqsym, cqarg_params, aux_params, collector = quantize_graph(sym=net, arg_params=arg_params, aux_params=aux_params, - excluded_sym_names=excluded_sym_names, - calib_mode=calib_mode, - quantized_dtype=quantized_dtype, logger=logging) - max_num_examples = num_calib_batches * batch_size - mod._exec_group.execs[0].set_monitor_callback(collector.collect, monitor_all=True) - num_batches = 0 - num_examples = 0 - for batch in val_iter: - mod.forward(batch) - num_batches += 1 - num_examples += batch_size - if num_examples >= max_num_examples: - break - logging.info("Collected statistics from %d batches with batch_size=%d" - % (num_batches, batch_size)) - cqsym, cqarg_params, aux_params = calib_graph(qsym=cqsym, arg_params=arg_params, aux_params=aux_params, - collector=collector, calib_mode=calib_mode, - quantized_dtype=quantized_dtype, logger=logging) - sym_name = '%s-symbol.json' % (args.prefix + '-quantized') - cqsym = cqsym.get_backend_symbol('MKLDNN_QUANTIZE') - mx.model.save_checkpoint(args.prefix + '-quantized', args.epoch, cqsym, cqarg_params, aux_params) - elif benchmark: - logging.info('Benchmarking...') - data = [mx.random.randint(0, 1000, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] - batch = 
mx.io.DataBatch(data, []) # empty label - for i in range(2000): - mod.forward(batch, is_train=False) - logging.info('Benchmarking...') - num_samples = 0 - for ib, batch in enumerate(val_iter): - if ib == 5: - num_samples = 0 - tic = time.time() - mod.forward(batch, is_train=False) - mx.nd.waitall() - num_samples += batch_size - toc = time.time() - fps = num_samples/(toc - tic) - logging.info('Evaluating completed') - logging.info('Inference speed %.4f fps' % fps) - else: - logging.info('Evaluating...') - (hits, ndcgs) = evaluate_model(mod, testRatings, testNegatives, topK, batch_size, ctx, logging) - hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() - logging.info('Evaluate: HR = %.4f, NDCG = %.4f' % (hr, ndcg)) - diff --git a/example/neural_collaborative_filtering/train.py b/example/neural_collaborative_filtering/train.py deleted file mode 100644 index f99b16fd5b0e..000000000000 --- a/example/neural_collaborative_filtering/train.py +++ /dev/null @@ -1,163 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
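The NCF scripts above report HR and NDCG computed by `evaluate_model` from the removed `core/evaluate.py`. For reference, a minimal sketch of the standard per-user leave-one-out metrics (an assumed protocol; the removed helper itself is not reproduced in this diff):

```python
# Sketch of HR@K / NDCG@K under the usual NCF leave-one-out protocol
# (assumed; the removed core/evaluate.py is not shown in this diff).
import math

def hit_and_ndcg(ranked_items, positive_item, k=10):
    topk = ranked_items[:k]
    if positive_item not in topk:
        return 0.0, 0.0                      # miss: both metrics are zero
    rank = topk.index(positive_item)         # 0-based position within the top-K
    return 1.0, 1.0 / math.log2(rank + 2)    # HR@K, NDCG@K

# Example: the held-out positive item is ranked third among the candidates.
print(hit_and_ndcg([5, 9, 42, 7], 42, k=10))  # -> (1.0, 0.5)
```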
-# -import os -import time -import argparse -import logging -import math -import random -import numpy as np -import mxnet as mx -from mxnet import gluon -from core.model import get_model -from core.dataset import NCFTrainData, NCFTestData -from core.evaluate import * - - -logging.basicConfig(level=logging.DEBUG) - -parser = argparse.ArgumentParser(description="Run matrix factorization with embedding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -parser.add_argument('--path', nargs='?', default='./data/', - help='Input data path.') -parser.add_argument('--dataset', nargs='?', default='ml-20m', - help='The dataset name.') -parser.add_argument('--batch-size', type=int, default=2048, - help='number of training examples per batch') -parser.add_argument('--eval-batch-size', type=int, default=1000, - help='number of evaluate examples per batch') -parser.add_argument('--model-type', type=str, default='neumf', choices=['neumf', 'gmf', 'mlp'], - help="mdoel type") -parser.add_argument('--num-negative', type=int, default=4, - help="number of negative samples per positive sample while training.") -parser.add_argument('--layers', default='[256, 256, 128, 64]', - help="list of number hiddens of fc layers in mlp model.") -parser.add_argument('--factor-size-gmf', type=int, default=64, - help="outdim of gmf embedding layers.") -parser.add_argument('--num-hidden', type=int, default=1, - help="num-hidden of neumf fc layer") -parser.add_argument('--log-interval', type=int, default=100, - help='logging interval') -parser.add_argument('--learning-rate', type=float, default=0.0005, - help='learning rate for optimizer') -parser.add_argument('--beta1', '-b1', type=float, default=0.9, - help='beta1 for Adam') -parser.add_argument('--beta2', '-b2', type=float, default=0.999, - help='beta1 for Adam') -parser.add_argument('--eps', type=float, default=1e-8, - help='eps for Adam') -parser.add_argument('--topk', type=int, default=10, - help="topk for accuracy evaluation.") -parser.add_argument('--gpu', type=int, default=None, - help="list of gpus to run, e.g. 0 or 0,2. empty means using cpu().") -parser.add_argument('--workers', type=int, default=8, help='thread number for dataloader.') -parser.add_argument('--epoch', type=int, default=14, help='training epoch') -parser.add_argument('--seed', type=int, default=3, help='random seed to use. 
Default=3.') -parser.add_argument('--deploy', action='store_true', help="whether to load static graph for deployment") - - -def cross_entropy(label, pred, eps=1e-12): - ce = 0 - for l, p in zip(label, pred): - ce += -( l*np.log(p+eps) + (1-l)*np.log(1-p+eps)) - return ce - -if __name__ == '__main__': - head = '%(asctime)-15s %(message)s' - logging.basicConfig(level=logging.INFO, format=head) - - # arg parser - args = parser.parse_args() - logging.info(args) - - mx.random.seed(args.seed) - np.random.seed(args.seed) - batch_size = args.batch_size - eval_batch_size = args.eval_batch_size - model_type = args.model_type - model_layers = eval(args.layers) - factor_size_gmf = args.factor_size_gmf - factor_size_mlp = int(model_layers[0]/2) - num_hidden = args.num_hidden - learning_rate=args.learning_rate - beta1=args.beta1 - beta2=args.beta2 - eps=args.eps - ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu) - topK = args.topk - num_negatives = args.num_negative - num_workers = args.workers - epoch = args.epoch - log_interval = args.log_interval - - # prepare dataset - logging.info('Prepare Dataset') - train_dataset = NCFTrainData((args.path + args.dataset + '/train-ratings.csv'), num_negatives) - test_data = NCFTestData(args.path + args.dataset) - train_dataloader = mx.gluon.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, last_batch='rollover') - logging.info('Prepare Dataset completed') - # construct the model - net = get_model(model_type, factor_size_mlp, factor_size_gmf, - model_layers, num_hidden, train_dataset.nb_users, train_dataset.nb_items) - - # initialize the module - mod = mx.module.Module(net, context=ctx, data_names=['user', 'item'], label_names=['softmax_label']) - provide_data = [mx.io.DataDesc(name='item', shape=((batch_size,))), - mx.io.DataDesc(name='user', shape=((batch_size,)))] - provide_label = [mx.io.DataDesc(name='softmax_label', shape=((batch_size,)))] - mod.bind(for_training=True, data_shapes=provide_data, label_shapes=provide_label) - mod.init_params() - mod.init_optimizer(optimizer='adam', optimizer_params=[('learning_rate', learning_rate), ('beta1',beta1), ('beta2',beta2), ('epsilon',eps)]) - - metric = mx.gluon.metric.create(cross_entropy) - speedometer = mx.callback.Speedometer(batch_size, log_interval) - best_hr, best_ndcg, best_iter = -1, -1, -1 - logging.info('Training started ...') - for epoch in range(epoch): - metric.reset() - for nbatch, seqs in enumerate(train_dataloader): - user_id, item_id, labels = seqs - batch = mx.io.DataBatch(data = [item_id.astype('int32').as_in_context(ctx), - user_id.astype('int32').as_in_context(ctx)], - label = [labels.as_in_context(ctx)]) - mod.forward(batch) - mod.backward() - mod.update() - predicts=mod.get_outputs()[0] - metric.update(labels = labels, preds = predicts) - speedometer_param = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch, - eval_metric=metric, locals=locals()) - speedometer(speedometer_param) - - # save model - dir_path = os.path.dirname(os.path.realpath(__file__)) - model_path = os.path.join(dir_path, 'model', args.dataset) - if not os.path.exists(model_path): - os.makedirs(model_path) - mod.save_checkpoint(os.path.join(model_path, model_type), epoch) - # compute hit ratio - (hits, ndcgs) = evaluate_model(mod, test_data.testRatings, test_data.testNegatives, topK, eval_batch_size, ctx, logging) - hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean() - logging.info('Iteration %d: HR = %.4f, NDCG = %.4f' % (epoch, hr, ndcg)) - # best hit ratio - if 
hr > best_hr: - best_hr, best_ndcg, best_iter = hr, ndcg, epoch - - logging.info("End. Best Iteration %d: HR = %.4f, NDCG = %.4f. " % (best_iter, best_hr, best_ndcg)) - logging.info('Training completed.') - diff --git a/example/ssd/README.md b/example/ssd/README.md deleted file mode 100644 index f85b90049213..000000000000 --- a/example/ssd/README.md +++ /dev/null @@ -1,270 +0,0 @@ - - - - - - - - - - - - - - - - - -# SSD: Single Shot MultiBox Object Detector - -SSD is an unified framework for object detection with a single network. - -You can use the code to train/evaluate/test for object detection task. - -------------------- - -## Gluon Implementation - -You can find a Gluon implementation on [gluon-cv](https://gluon-cv.mxnet.io/build/examples_detection/train_ssd_voc.html). - -------------------- - -### Disclaimer -This is a re-implementation of original SSD which is based on caffe. The official -repository is available [here](https://github.com/weiliu89/caffe/tree/ssd). -The arXiv paper is available [here](http://arxiv.org/abs/1512.02325). - -This example is intended for reproducing the nice detector while fully utilize the -remarkable traits of MXNet. -* The result is almost identical to the original version. However, due to different implementation details, the results might differ slightly. - -Due to the permission issue, this example is maintained in this [repository](https://github.com/zhreshold/mxnet-ssd) separately. You can use the link regarding specific per example [issues](https://github.com/zhreshold/mxnet-ssd/issues). - -### What's new -* Support training and inference on COCO dataset. Int8 inference achieves 0.253 mAP on CPU with MKL-DNN backend, which is a comparable accuracy to FP32 (0.2552 mAP). -* Support uint8 inference on CPU with MKL-DNN backend. Uint8 inference achieves 0.8364 mAP, which is a comparable accuracy to FP32 (0.8366 mAP). -* Added live camera capture and detection display (run with --camera flag). Example: - `./demo.py --camera --cpu --frame-resize 0.5` -* Added multiple trained models. -* Added a much simpler way to compose network from mainstream classification networks (resnet, inception...) and [Guide](symbol/README.md). -* Update to the latest version according to caffe version, with 5% mAP increase. -* Use C++ record iterator based on back-end multi-thread engine to achieve huge speed up on multi-gpu environments. -* Monitor validation mAP during training. -* More network symbols under development and test. -* Extra operators are now in `mxnet/src/operator/contrib`. -* Old models are incompatible, use [e06c55d](https://github.com/dmlc/mxnet/commits/e06c55d6466a0c98c7def8f118a48060fb868901) or [e4f73f1](https://github.com/dmlc/mxnet/commits/e4f73f1f4e76397992c4b0a33c139d52b4b7af0e) for backward compatibility. Or, you can modify the json file to update the symbols if you are familiar with it, because only names have changed while weights and bias should still be good. 
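Since this Module-based SSD example is superseded by the Gluon implementation linked above, a minimal GluonCV inference sketch (assuming the separate `gluoncv` package and its `ssd_512_resnet50_v1_voc` model zoo entry) looks roughly like:

```python
# Rough GluonCV equivalent of the demo below (assumes `pip install gluoncv`;
# the model name and preset helpers come from the GluonCV model zoo).
from gluoncv import model_zoo, data, utils
from matplotlib import pyplot as plt

net = model_zoo.get_model('ssd_512_resnet50_v1_voc', pretrained=True)
x, img = data.transforms.presets.ssd.load_test('street.jpg', short=512)
class_ids, scores, bboxes = net(x)

utils.viz.plot_bbox(img, bboxes[0], scores[0], class_ids[0],
                    class_names=net.classes)
plt.show()
```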
- -### Demo results -![demo1](https://cloud.githubusercontent.com/assets/3307514/19171057/8e1a0cc4-8be0-11e6-9d8f-088c25353b40.png) -![demo2](https://cloud.githubusercontent.com/assets/3307514/19171063/91ec2792-8be0-11e6-983c-773bd6868fa8.png) -![demo3](https://cloud.githubusercontent.com/assets/3307514/19171086/a9346842-8be0-11e6-8011-c17716b22ad3.png) - -### mAP -| Model | Training data | Test data | mAP | Note | -|:-----------------:|:----------------:|:---------:|:----:|:-----| -| [VGG16_reduced 300x300](https://github.com/zhreshold/mxnet-ssd/releases/download/v0.5-beta/vgg16_ssd_300_voc0712_trainval.zip) | VOC07+12 trainval| VOC07 test| 77.8| fast | -| [VGG16_reduced 512x512](https://github.com/zhreshold/mxnet-ssd/releases/download/v0.5-beta/vgg16_ssd_512_voc0712_trainval.zip) | VOC07+12 trainval | VOC07 test| 79.9| slow | -| [Inception-v3 512x512](https://github.com/zhreshold/mxnet-ssd/releases/download/v0.6/inceptionv3_ssd_512_voc0712_trainval.zip) | VOC07+12 trainval| VOC07 test| 78.9 | fastest | -| [Resnet-50 512x512](https://github.com/zhreshold/mxnet-ssd/releases/download/v0.6/resnet50_ssd_512_voc0712_trainval.zip) | VOC07+12 trainval| VOC07 test| 78.9 | fast | - -### Speed -| Model | GPU | CUDNN | Batch-size | FPS* | -|:---------------------:|:----------------:|:-----:|:----------:|:----:| -| VGG16_reduced 300x300 | TITAN X(Maxwell) | v5.1 | 16 | 95 | -| VGG16_reduced 300x300 | TITAN X(Maxwell) | v5.1 | 8 | 95 | -| VGG16_reduced 300x300 | TITAN X(Maxwell) | v5.1 | 1 | 64 | -| VGG16_reduced 300x300 | TITAN X(Maxwell) | N/A | 8 | 36 | -| VGG16_reduced 300x300 | TITAN X(Maxwell) | N/A | 1 | 28 | -*Forward time only, data loading and drawing excluded.* - - -### Getting started -* You will need python modules: `cv2`, `matplotlib` and `numpy`. -If you use mxnet-python api, you probably have already got them. -You can install them via pip or package managers, such as `apt-get`: -``` -sudo apt-get install python-opencv python-matplotlib python-numpy -``` - -* Build MXNet: Follow the official instructions -``` -# for Ubuntu/Debian -cp make/config.mk ./config.mk -# enable cuda, cudnn if applicable -``` -Remember to enable CUDA if you want to be able to train, since CPU training is -insanely slow. Using CUDNN is optional, but highly recommended. - -### Try the demo -* Download the pretrained model: [`ssd_resnet50_0712.zip`](https://github.com/zhreshold/mxnet-ssd/releases/download/v0.6/resnet50_ssd_512_voc0712_trainval.zip), and extract to `model/` directory. - -* Run -``` -# cd /path/to/incubator-mxnet/example/ssd -# download the test images -python data/demo/download_demo_images.py -# run the demo -python demo.py --gpu 0 -# play with examples: -python demo.py --epoch 0 --images ./data/demo/dog.jpg --thresh 0.5 -python demo.py --cpu --network resnet50 --data-shape 512 -# wait for library to load for the first time -``` -* Check `python demo.py --help` for more options. - -### Live Camera detection - -Use `init.sh` to download the trained model. -You can use `./demo.py --camera` to use a video capture device with opencv such as a webcam. This -will open a window that will display the camera output together with the detections. You can play -with the detection threshold to get more or less detections. - -### Train the model on VOC -* Note that we recommend to use gluon-cv to train the model, please refer to [gluon-cv ssd](https://gluon-cv.mxnet.io/build/examples_detection/train_ssd_voc.html). -This example only covers training on Pascal VOC or MS COCO dataset. 
Other datasets should -be easily supported by adding subclass derived from class `Imdb` in `dataset/imdb.py`. -See example of `dataset/pascal_voc.py` for details. -* Download the converted pretrained `vgg16_reduced` model [here](https://github.com/zhreshold/mxnet-ssd/releases/download/v0.2-beta/vgg16_reduced.zip), unzip `.param` and `.json` files -into `model/` directory by default. -* Download the PASCAL VOC dataset, skip this step if you already have one. -``` -cd /path/to/where_you_store_datasets/ -wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar -wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar -wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar -# Extract the data. -tar -xvf VOCtrainval_11-May-2012.tar -tar -xvf VOCtrainval_06-Nov-2007.tar -tar -xvf VOCtest_06-Nov-2007.tar -``` -* We are going to use `trainval` set in VOC2007/2012 as a common strategy. -The suggested directory structure is to store `VOC2007` and `VOC2012` directories -in the same `VOCdevkit` folder. -* Then link `VOCdevkit` folder to `data/VOCdevkit` by default: -``` -ln -s /path/to/VOCdevkit /path/to/incubator-mxnet/example/ssd/data/VOCdevkit -``` -Use hard link instead of copy could save us a bit disk space. -* Create packed binary file for faster training: -``` -# cd /path/to/incubator-mxnet/example/ssd -bash tools/prepare_pascal.sh -# or if you are using windows -python tools/prepare_dataset.py --dataset pascal --year 2007,2012 --set trainval --target ./data/train.lst -python tools/prepare_dataset.py --dataset pascal --year 2007 --set test --target ./data/val.lst --no-shuffle -``` -* Start training: -``` -# cd /path/to/incubator-mxnet/example/ssd -python train.py -``` -* By default, this example will use `batch-size=32` and `learning_rate=0.002`. -You might need to change the parameters a bit if you have different configurations. -Check `python train.py --help` for more training options. For example, if you have 4 GPUs, use: -``` -# note that a perfect training parameter set is yet to be discovered for multi-GPUs -python train.py --gpus 0,1,2,3 --batch-size 32 -``` - -### Train the model on COCO -* Download the COCO2014 dataset, skip this step if you already have one. -``` -cd /path/to/where_you_store_datasets/ -wget http://images.cocodataset.org/zips/train2014.zip -wget http://images.cocodataset.org/zips/val2014.zip -wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip -# Extract the data. -unzip train2014.zip -unzip val2014.zip -unzip annotations_trainval2014.zip -``` -* We are going to use `train2014,valminusminival2014` set in COCO2014 for training and `minival2014` for evaluation as a common strategy. -* Then link `COCO2014` folder to `data/coco` by default: -``` -ln -s /path/to/COCO2014 /path/to/incubator-mxnet/example/ssd/data/coco -``` -Use hard link instead of copy could save us a bit disk space. 
-* Create packed binary file for faster training: -``` -# cd /path/to/incubator-mxnet/example/ssd -bash tools/prepare_coco.sh -# or if you are using windows -python tools/prepare_dataset.py --dataset coco --set train2014,valminusminival2014 --target ./data/train.lst --root ./data/coco -python tools/prepare_dataset.py --dataset coco --set minival2014 --target ./data/val.lst --root ./data/coco --no-shuffle -``` -* Start training: -``` -# cd /path/to/incubator-mxnet/example/ssd -python train.py --label-width=560 --num-class=80 --class-names=./dataset/names/coco_label --pretrained="" --num-example=117265 --batch-size=64 -``` - -### Evalute trained model -Make sure you have val.rec as validation dataset. It's the same one as used in training. Use: -``` -# cd /path/to/incubator-mxnet/example/ssd -python evaluate.py --gpus 0,1 --batch-size 128 --epoch 0 - -# Evaluate on COCO dataset -python evaluate.py --gpus 0,1 --batch-size 128 --epoch 0 --num-class=80 --class-names=./dataset/names/mscoco.names -``` - -### Quantize model - -To quantize a model on VOC dataset, follow the [Train instructions](https://github.com/apache/incubator-mxnet/tree/master/example/ssd#train-the-model-on-VOC) to train a FP32 `SSD-VGG16_reduced_300x300` model based on Pascal VOC dataset. You can also download our [SSD-VGG16 pre-trained model](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-dd479559.zip) and [packed binary data](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd-val-fc19a535.zip). Create `model` and `data` directories if they're not exist, extract the zip files, then rename the uncompressed files as follows (eg, rename `ssd-val-fc19a535.idx` to `val.idx`, `ssd-val-fc19a535.lst` to `val.lst`, `ssd-val-fc19a535.rec` to `val.rec`, `ssd_vgg16_reduced_300-dd479559.params` to `ssd_vgg16_reduced_300-0000.params`, `ssd_vgg16_reduced_300-symbol-dd479559.json` to `ssd_vgg16_reduced_300-symbol.json`.) - -To quantize a model on COCO dataset, follow the [Train instructions](https://github.com/apache/incubator-mxnet/tree/master/example/ssd#train-the-model-on-COCO) to train a FP32 `SSD-VGG16_reduced_300x300` model based on COCO dataset. You can also download our [SSD-VGG16 pre-trained model](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/ssd_vgg16_reduced_300-7fedd4ad.zip) and [packed binary data](http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ssd_coco-val-e91096e8.zip). Create `model` and `data` directories if they're not exist, extract the zip files, then rename the uncompressed files as follows (eg, rename `ssd_coco-val-e91096e8.idx` to `val.idx`, `ssd_coco-val-e91096e8.lst` to `val.lst`, `ssd_coco-val-e91096e8.rec` to `val.rec`, `ssd_vgg16_reduced_300-7fedd4ad.params` to `ssd_vgg16_reduced_300-0000.params`, `ssd_vgg16_reduced_300-symbol-7fedd4ad.json` to `ssd_vgg16_reduced_300-symbol.json`.) - -``` -data/ -|---val.rec -|---val.lxt -|---val.idx -model/ -|---ssd_vgg16_reduced_300-0000.params -|---ssd_vgg16_reduced_300-symbol.json -``` - -Then, use the following command for quantization. By default, this script uses 5 batches (32 samples per batch) for naive calibration: - -``` -python quantization.py -``` - -After quantization, INT8 models will be saved in `model/` dictionary. Use the following command to launch inference. 
- -``` - -# Launch FP32 Inference on VOC dataset -python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/ssd_ - -# Launch INT8 Inference on VOC dataset -python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/cqssd_ - -# Launch FP32 Inference on COCO dataset - -python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/ssd_ --num-class=80 --class-names=./dataset/names/mscoco.names - -# Launch INT8 Inference on COCO dataset - -python evaluate.py --cpu --num-batch 10 --batch-size 224 --deploy --prefix=./model/cqssd_ --num-class=80 --class-names=./dataset/names/mscoco.names - -# Launch dummy data Inference -python benchmark_score.py --deploy --prefix=./model/ssd_ -python benchmark_score.py --deploy --prefix=./model/cqssd_ -``` -### Convert model to deploy mode -This simply removes all loss layers, and attach a layer for merging results and non-maximum suppression. -Useful when loading python symbol is not available. -``` -# cd /path/to/incubator-mxnet/example/ssd -python deploy.py --num-class 20 -``` - -### Legacy models -Since the new interface for composing network is introduced, the old models have inconsistent names for weights. -You can still load the previous model by rename the symbol to `legacy_xxx.py` -and call with `python train/demo.py --network legacy_xxx ` -For example: -``` -python demo.py --network 'legacy_vgg16_ssd_300.py' --prefix model/ssd_300 --epoch 0 -``` diff --git a/example/ssd/__init__.py b/example/ssd/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/ssd/benchmark_score.py b/example/ssd/benchmark_score.py deleted file mode 100644 index 01a0eb9528da..000000000000 --- a/example/ssd/benchmark_score.py +++ /dev/null @@ -1,117 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from __future__ import print_function -import os -import sys -import argparse -import importlib -import mxnet as mx -import time -import logging - -from symbol.symbol_factory import get_symbol -from symbol.symbol_factory import get_symbol_train -from symbol import symbol_builder - - -parser = argparse.ArgumentParser(description='MXNet SSD benchmark') -parser.add_argument('--network', '-n', type=str, default='vgg16_reduced') -parser.add_argument('--batch_size', '-b', type=int, default=0) -parser.add_argument('--shape', '-w', type=int, default=300) -parser.add_argument('--class_num', '-class', type=int, default=20) -parser.add_argument('--prefix', dest='prefix', help='load model prefix', - default=os.path.join(os.getcwd(), 'model', 'ssd_'), type=str) -parser.add_argument('--deploy', dest='deploy', help='Load network from model', - action='store_true', default=False) - - -def get_data_shapes(batch_size): - image_shape = (3, 300, 300) - return [('data', (batch_size,)+image_shape)] - -def get_label_shapes(batch_size): - return [('label', (batch_size,) + (42, 6))] - -def get_data(batch_size): - data_shapes = get_data_shapes(batch_size) - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in data_shapes] - batch = mx.io.DataBatch(data, []) - return batch - - -if __name__ == '__main__': - args = parser.parse_args() - network = args.network - image_shape = args.shape - num_classes = args.class_num - b = args.batch_size - prefix = args.prefix - supported_image_shapes = [300, 512] - supported_networks = ['vgg16_reduced', 'inceptionv3', 'resnet50'] - - if network not in supported_networks: - raise Exception(network + " is not supported") - - if image_shape not in supported_image_shapes: - raise Exception("Image shape should be either 300*300 or 512*512!") - - if b == 0: - batch_sizes = [1, 2, 4, 8, 16, 32] - else: - batch_sizes = [b] - - data_shape = (3, image_shape, image_shape) - - if args.deploy == True: - prefix += network + '_' + str(data_shape[1]) + '-symbol.json' - net = mx.sym.load(prefix) - else: - net = get_symbol(network, data_shape[1], num_classes=num_classes, - nms_thresh=0.4, force_suppress=True) - if not 'label' in net.list_arguments(): - label = mx.sym.Variable(name='label') - net = mx.sym.Group([net, label]) - - num_batches = 100 - dry_run = 5 # use 5 iterations to warm up - - for bs in batch_sizes: - batch = get_data(bs) - mod = mx.mod.Module(net, label_names=('label',), context=mx.cpu()) - mod.bind(for_training = False, - inputs_need_grad = False, - data_shapes = get_data_shapes(bs), - label_shapes = get_label_shapes(bs)) - mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - - # get data - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.cpu()) for _, shape in mod.data_shapes] - batch = mx.io.DataBatch(data, []) - - for i in range(dry_run + num_batches): - if i == dry_run: - tic = time.time() - mod.forward(batch, is_train=False) - for output in mod.get_outputs(): - output.wait_to_read() - - avg_time = (time.time() - tic) / num_batches - fps = bs / avg_time - print("SSD-" + network + " with " + str(num_classes) + " classes and shape " + str(data_shape)) - print("batchsize=" + str(bs) + " " + str(1000*avg_time) + " ms") - print("batchsize=" + str(bs) + " " + str(fps) + " imgs/s") diff --git a/example/ssd/config/__init__.py b/example/ssd/config/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/ssd/config/config.py b/example/ssd/config/config.py deleted file mode 100644 index 8d44a0d992c3..000000000000 
--- a/example/ssd/config/config.py +++ /dev/null @@ -1,85 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -from .utils import DotDict, namedtuple_with_defaults, zip_namedtuple, config_as_dict - -RandCropper = namedtuple_with_defaults('RandCropper', - 'min_crop_scales, max_crop_scales, \ - min_crop_aspect_ratios, max_crop_aspect_ratios, \ - min_crop_overlaps, max_crop_overlaps, \ - min_crop_sample_coverages, max_crop_sample_coverages, \ - min_crop_object_coverages, max_crop_object_coverages, \ - max_crop_trials', - [0.0, 1.0, - 0.5, 2.0, - 0.0, 1.0, - 0.0, 1.0, - 0.0, 1.0, - 25]) - -RandPadder = namedtuple_with_defaults('RandPadder', - 'rand_pad_prob, max_pad_scale, fill_value', - [0.0, 1.0, 127]) - -ColorJitter = namedtuple_with_defaults('ColorJitter', - 'random_hue_prob, max_random_hue, \ - random_saturation_prob, max_random_saturation, \ - random_illumination_prob, max_random_illumination, \ - random_contrast_prob, max_random_contrast', - [0.0, 18, - 0.0, 32, - 0.0, 32, - 0.0, 0.5]) - - -cfg = DotDict() -cfg.ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) - -# training configs -cfg.train = DotDict() -# random cropping samplers -cfg.train.rand_crop_samplers = [ - RandCropper(min_crop_scales=0.3, min_crop_overlaps=0.1), - RandCropper(min_crop_scales=0.3, min_crop_overlaps=0.3), - RandCropper(min_crop_scales=0.3, min_crop_overlaps=0.5), - RandCropper(min_crop_scales=0.3, min_crop_overlaps=0.7), - RandCropper(min_crop_scales=0.3, min_crop_overlaps=0.9),] -cfg.train.crop_emit_mode = 'center' -# cfg.train.emit_overlap_thresh = 0.4 -# random padding -cfg.train.rand_pad = RandPadder(rand_pad_prob=0.5, max_pad_scale=4.0) -# random color jitter -cfg.train.color_jitter = ColorJitter(random_hue_prob=0.5, random_saturation_prob=0.5, - random_illumination_prob=0.5, random_contrast_prob=0.5) -cfg.train.inter_method = 10 # random interpolation -cfg.train.rand_mirror_prob = 0.5 -cfg.train.shuffle = True -cfg.train.seed = 233 -cfg.train.preprocess_threads = 48 -cfg.train = config_as_dict(cfg.train) # convert to normal dict - -# validation -cfg.valid = DotDict() -cfg.valid.rand_crop_samplers = [] -cfg.valid.rand_pad = RandPadder() -cfg.valid.color_jitter = ColorJitter() -cfg.valid.rand_mirror_prob = 0 -cfg.valid.shuffle = False -cfg.valid.seed = 0 -cfg.valid.preprocess_threads = 32 -cfg.valid = config_as_dict(cfg.valid) # convert to normal dict diff --git a/example/ssd/config/utils.py b/example/ssd/config/utils.py deleted file mode 100644 index 5c8af6a4dd93..000000000000 --- a/example/ssd/config/utils.py +++ /dev/null @@ -1,108 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import collections - -class DotDict(dict): - """ - Simple class for dot access elements in dict, support nested initialization - Example: - d = DotDict({'child': 'dotdict'}, name='dotdict', index=1, contents=['a', 'b']) - # add new key - d.new_key = '!' # or d['new_key'] = '!' - # update values - d.new_key = '!!!' - # delete keys - del d.new_key - """ - def __init__(self, *args, **kwargs): - super(DotDict, self).__init__(*args, **kwargs) - for arg in args: - if isinstance(arg, dict): - for k, v in arg.items(): - self[k] = v - - if kwargs: - for k, v in kwargs.items(): - self[k] = v - - def __getattr__(self, attr): - return self.get(attr) - - def __setattr__(self, key, value): - self.__setitem__(key, value) - - def __setitem__(self, key, value): - super(DotDict, self).__setitem__(key, value) - self.__dict__.update({key: value}) - - def __delattr__(self, item): - self.__delitem__(item) - - def __delitem__(self, key): - super(DotDict, self).__delitem__(key) - del self.__dict__[key] - - -def namedtuple_with_defaults(typename, field_names, default_values=()): - """ create a namedtuple with default values """ - T = collections.namedtuple(typename, field_names) - T.__new__.__defaults__ = (None, ) * len(T._fields) - if isinstance(default_values, collections.Mapping): - prototype = T(**default_values) - else: - prototype = T(*default_values) - T.__new__.__defaults__ = tuple(prototype) - return T - -def merge_dict(a, b): - """ merge dict a, b, with b overriding keys in a """ - c = a.copy() - c.update(b) - return c - -def zip_namedtuple(nt_list): - """ accept list of namedtuple, return a dict of zipped fields """ - if not nt_list: - return dict() - if not isinstance(nt_list, list): - nt_list = [nt_list] - for nt in nt_list: - assert type(nt) == type(nt_list[0]) - ret = {k : [v] for k, v in nt_list[0]._asdict().items()} - for nt in nt_list[1:]: - for k, v in nt._asdict().items(): - ret[k].append(v) - return ret - -def config_as_dict(cfg): - """ convert raw configuration to unified dictionary """ - ret = cfg.__dict__.copy() - # random cropping params - del ret['rand_crop_samplers'] - assert isinstance(cfg.rand_crop_samplers, list) - ret = merge_dict(ret, zip_namedtuple(cfg.rand_crop_samplers)) - num_crop_sampler = len(cfg.rand_crop_samplers) - ret['num_crop_sampler'] = num_crop_sampler # must specify the # - ret['rand_crop_prob'] = 1.0 / (num_crop_sampler + 1) * num_crop_sampler - # random padding params - del ret['rand_pad'] - ret = merge_dict(ret, cfg.rand_pad._asdict()) - # color jitter - del ret['color_jitter'] - ret = merge_dict(ret, cfg.color_jitter._asdict()) - return ret diff --git a/example/ssd/data/demo/download_demo_images.py b/example/ssd/data/demo/download_demo_images.py deleted file mode 100644 index 425f71393ce2..000000000000 --- a/example/ssd/data/demo/download_demo_images.py +++ /dev/null @@ -1,38 +0,0 @@ 
-#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os - -wd = os.path.dirname(os.path.realpath(__file__)) - -def download(url, target): - os.system("wget {} -O {}".format(url, target)) - -if __name__ == "__main__": - base_url = "https://cloud.githubusercontent.com/assets/3307514/" - demo_list = {"20012566/cbb53c76-a27d-11e6-9aaa-91939c9a1cd5.jpg":"000001.jpg", - "20012564/cbb43894-a27d-11e6-9619-ba792b66c4ae.jpg": "000002.jpg", - "20012565/cbb53942-a27d-11e6-996c-125bb060a81d.jpg": "000004.jpg", - "20012562/cbb4136e-a27d-11e6-884c-ed83c165b422.jpg": "000010.jpg", - "20012567/cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg": "dog.jpg", - "20012563/cbb41382-a27d-11e6-92a9-18dab4fd1ad3.jpg": "person.jpg", - "20012568/cbc2d6f6-a27d-11e6-94c3-d35a9cb47609.jpg": "street.jpg"} - for k, v in demo_list.items(): - download(base_url + k, os.path.join(wd, v)) diff --git a/example/ssd/dataset/__init__.py b/example/ssd/dataset/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/ssd/dataset/concat_db.py b/example/ssd/dataset/concat_db.py deleted file mode 100644 index 7e22105ff8cf..000000000000 --- a/example/ssd/dataset/concat_db.py +++ /dev/null @@ -1,127 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from dataset.imdb import Imdb -import random - -class ConcatDB(Imdb): - """ - ConcatDB is used to concatenate multiple imdbs to form a larger db. - It is very useful to combine multiple dataset with same classes. 
- Parameters - ---------- - imdbs : Imdb or list of Imdb - Imdbs to be concatenated - shuffle : bool - whether to shuffle the initial list - """ - def __init__(self, imdbs, shuffle): - super(ConcatDB, self).__init__('concatdb') - if not isinstance(imdbs, list): - imdbs = [imdbs] - self.imdbs = imdbs - self._check_classes() - self.image_set_index = self._load_image_set_index(shuffle) - - def _check_classes(self): - """ - check input imdbs, make sure they have same classes - """ - try: - self.classes = self.imdbs[0].classes - self.num_classes = len(self.classes) - except AttributeError: - # fine, if no classes is provided - pass - - if self.num_classes > 0: - for db in self.imdbs: - assert self.classes == db.classes, "Multiple imdb must have same classes" - - def _load_image_set_index(self, shuffle): - """ - get total number of images, init indices - - Parameters - ---------- - shuffle : bool - whether to shuffle the initial indices - """ - self.num_images = 0 - for db in self.imdbs: - self.num_images += db.num_images - indices = list(range(self.num_images)) - if shuffle: - random.shuffle(indices) - return indices - - def _locate_index(self, index): - """ - given index, find out sub-db and sub-index - - Parameters - ---------- - index : int - index of a specific image - - Returns - ---------- - a tuple (sub-db, sub-index) - """ - assert index >= 0 and index < self.num_images, "index out of range" - pos = self.image_set_index[index] - for k, v in enumerate(self.imdbs): - if pos >= v.num_images: - pos -= v.num_images - else: - return (k, pos) - - def image_path_from_index(self, index): - """ - given image index, find out full path - - Parameters - ---------- - index: int - index of a specific image - - Returns - ---------- - full path of this image - """ - assert self.image_set_index is not None, "Dataset not initialized" - pos = self.image_set_index[index] - n_db, n_index = self._locate_index(index) - return self.imdbs[n_db].image_path_from_index(n_index) - - def label_from_index(self, index): - """ - given image index, return preprocessed ground-truth - - Parameters - ---------- - index: int - index of a specific image - - Returns - ---------- - ground-truths of this image - """ - assert self.image_set_index is not None, "Dataset not initialized" - pos = self.image_set_index[index] - n_db, n_index = self._locate_index(index) - return self.imdbs[n_db].label_from_index(n_index) diff --git a/example/ssd/dataset/cv2Iterator.py b/example/ssd/dataset/cv2Iterator.py deleted file mode 100644 index 0af8c3272fa9..000000000000 --- a/example/ssd/dataset/cv2Iterator.py +++ /dev/null @@ -1,65 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import mxnet as mx -import numpy as np -import cv2 - - -class CameraIterator(): - """ - An iterator that captures frames with opencv or the specified capture - """ - def __init__(self, capture=cv2.VideoCapture(0), frame_resize=None): - self._capture = capture - self._frame_resize = None - if frame_resize: - if isinstance(frame_resize, (tuple, list)) and (len(frame_resize) == 2): - self._frame_resize = tuple(map(int, frame_resize)) - self._frame_shape = (1, 3, self._frame_resize[0], self._frame_resize[1]) - elif isinstance(frame_resize, float): - width = int(self._capture.get(cv2.CAP_PROP_FRAME_WIDTH)*frame_resize) - height = int(self._capture.get(cv2.CAP_PROP_FRAME_HEIGHT)*frame_resize) - self._frame_shape = (1, 3, width, height) - self._frame_resize = (width, height) - else: - assert False, "frame_resize should be a tuple of (x,y) pixels " - "or a float setting the scaling factor" - else: - self._frame_shape = (1, 3, - int(self._capture.get(cv2.CAP_PROP_FRAME_WIDTH)), - int(self._capture.get(cv2.CAP_PROP_FRAME_HEIGHT))) - - def __iter__(self): - return self - - def __next__(self): - ret, frame = self._capture.read() - if cv2.waitKey(1) & 0xFF == ord('q') or ret is not True: - raise StopIteration - if self._frame_resize: - frame = cv2.resize(frame, (self._frame_resize[0], self._frame_resize[1])) - return frame - - def __enter__(self): - pass - - def __exit__(self, exc_type, exc_alue, traceback): - self.close() - - def close(self): - self._capture.release() diff --git a/example/ssd/dataset/imdb.py b/example/ssd/dataset/imdb.py deleted file mode 100644 index 4fbb5d85c873..000000000000 --- a/example/ssd/dataset/imdb.py +++ /dev/null @@ -1,127 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np -import os.path as osp - -class Imdb(object): - """ - Base class for dataset loading - - Parameters: - ---------- - name : str - name of dataset - """ - def __init__(self, name): - self.name = name - self.classes = [] - self.num_classes = 0 - self.image_set_index = None - self.num_images = 0 - self.labels = None - self.padding = 0 - - def image_path_from_index(self, index): - """ - load image full path given specified index - - Parameters: - ---------- - index : int - index of image requested in dataset - - Returns: - ---------- - full path of specified image - """ - raise NotImplementedError - - def label_from_index(self, index): - """ - load ground-truth of image given specified index - - Parameters: - ---------- - index : int - index of image requested in dataset - - Returns: - ---------- - object ground-truths, in format - numpy.array([id, xmin, ymin, xmax, ymax]...) 
- """ - raise NotImplementedError - - def save_imglist(self, fname=None, root=None, shuffle=False): - """ - save imglist to disk - - Parameters: - ---------- - fname : str - saved filename - """ - def progress_bar(count, total, suffix=''): - import sys - bar_len = 24 - filled_len = int(round(bar_len * count / float(total))) - - percents = round(100.0 * count / float(total), 1) - bar = '=' * filled_len + '-' * (bar_len - filled_len) - sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', suffix)) - sys.stdout.flush() - - str_list = [] - for index in range(self.num_images): - progress_bar(index, self.num_images) - label = self.label_from_index(index) - if label.size < 1: - continue - path = self.image_path_from_index(index) - if root: - path = osp.relpath(path, root) - str_list.append('\t'.join([str(index), str(2), str(label.shape[1])] \ - + ["{0:.4f}".format(x) for x in label.ravel()] + [path,]) + '\n') - if str_list: - if shuffle: - import random - random.shuffle(str_list) - if not fname: - fname = self.name + '.lst' - with open(fname, 'w') as f: - for line in str_list: - f.write(line) - else: - raise RuntimeError("No image in imdb") - - def _load_class_names(self, filename, dirname): - """ - load class names from text file - - Parameters: - ---------- - filename: str - file stores class names - dirname: str - file directory - """ - full_path = osp.join(dirname, filename) - classes = [] - with open(full_path, 'r') as f: - classes = [l.strip() for l in f.readlines()] - return classes diff --git a/example/ssd/dataset/iterator.py b/example/ssd/dataset/iterator.py deleted file mode 100644 index 0d35b4724102..000000000000 --- a/example/ssd/dataset/iterator.py +++ /dev/null @@ -1,307 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -import numpy as np -import cv2 -from tools.rand_sampler import RandSampler - -class DetRecordIter(mx.io.DataIter): - """ - The new detection iterator wrapper for mx.io.ImageDetRecordIter which is - written in C++, it takes record file as input and runs faster. - Supports various augment operations for object detection. 
- - Parameters: - ----------- - path_imgrec : str - path to the record file - path_imglist : str - path to the list file to replace the labels in record - batch_size : int - batch size - data_shape : tuple - (3, height, width) - label_width : int - specify the label width, use -1 for variable length - label_pad_width : int - labels must have same shape in batches, use -1 for automatic estimation - in each record, otherwise force padding to width in case you want t - rain/validation to match the same width - label_pad_value : float - label padding value - resize_mode : str - force - resize to data_shape regardless of aspect ratio - fit - try fit to data_shape preserving aspect ratio - shrink - shrink to data_shape only, preserving aspect ratio - mean_pixels : list or tuple - mean values for red/green/blue - kwargs : dict - see mx.io.ImageDetRecordIter - - Returns: - ---------- - - """ - def __init__(self, path_imgrec, batch_size, data_shape, path_imglist="", - label_width=-1, label_pad_width=-1, label_pad_value=-1, - resize_mode='force', mean_pixels=[123.68, 116.779, 103.939], - **kwargs): - super(DetRecordIter, self).__init__() - self.rec = mx.io.ImageDetRecordIter( - path_imgrec = path_imgrec, - path_imglist = path_imglist, - label_width = label_width, - label_pad_width = label_pad_width, - label_pad_value = label_pad_value, - batch_size = batch_size, - data_shape = data_shape, - mean_r = mean_pixels[0], - mean_g = mean_pixels[1], - mean_b = mean_pixels[2], - resize_mode = resize_mode, - **kwargs) - - self.provide_label = None - self._get_batch() - if not self.provide_label: - raise RuntimeError("Invalid ImageDetRecordIter: " + path_imgrec) - self.reset() - - @property - def provide_data(self): - return self.rec.provide_data - - def reset(self): - self.rec.reset() - - def iter_next(self): - return self._get_batch() - - def next(self): - if self.iter_next(): - return self._batch - else: - raise StopIteration - - def _get_batch(self): - self._batch = self.rec.next() - if not self._batch: - return False - - if self.provide_label is None: - # estimate the label shape for the first batch, always reshape to n*5 - first_label = self._batch.label[0][0].asnumpy() - self.batch_size = self._batch.label[0].shape[0] - self.label_header_width = int(first_label[4]) - self.label_object_width = int(first_label[5]) - assert self.label_object_width >= 5, "object width must >=5" - self.label_start = 4 + self.label_header_width - self.max_objects = (first_label.size - self.label_start) // self.label_object_width - self.label_shape = (self.batch_size, self.max_objects, self.label_object_width) - self.label_end = self.label_start + self.max_objects * self.label_object_width - self.provide_label = [('label', self.label_shape)] - - # modify label - label = self._batch.label[0].asnumpy() - label = label[:, self.label_start:self.label_end].reshape( - (self.batch_size, self.max_objects, self.label_object_width)) - self._batch.label = [mx.nd.array(label)] - return True - -class DetIter(mx.io.DataIter): - """ - Detection Iterator, which will feed data and label to network - Optional data augmentation is performed when providing batch - - Parameters: - ---------- - imdb : Imdb - image database - batch_size : int - batch size - data_shape : int or (int, int) - image shape to be resized - mean_pixels : float or float list - [R, G, B], mean pixel values - rand_samplers : list - random cropping sampler list, if not specified, will - use original image only - rand_mirror : bool - whether to randomly mirror input images, 
default False - shuffle : bool - whether to shuffle initial image list, default False - rand_seed : int or None - whether to use fixed random seed, default None - max_crop_trial : bool - if random crop is enabled, defines the maximum trial time - if trial exceed this number, will give up cropping - is_train : bool - whether in training phase, default True, if False, labels might - be ignored - """ - def __init__(self, imdb, batch_size, data_shape, \ - mean_pixels=[128, 128, 128], rand_samplers=[], \ - rand_mirror=False, shuffle=False, rand_seed=None, \ - is_train=True, max_crop_trial=50): - super(DetIter, self).__init__() - - self._imdb = imdb - self.batch_size = batch_size - if isinstance(data_shape, int): - data_shape = (data_shape, data_shape) - self._data_shape = data_shape - self._mean_pixels = mx.nd.array(mean_pixels).reshape((3,1,1)) - if not rand_samplers: - self._rand_samplers = [] - else: - if not isinstance(rand_samplers, list): - rand_samplers = [rand_samplers] - assert isinstance(rand_samplers[0], RandSampler), "Invalid rand sampler" - self._rand_samplers = rand_samplers - self.is_train = is_train - self._rand_mirror = rand_mirror - self._shuffle = shuffle - if rand_seed: - np.random.seed(rand_seed) # fix random seed - self._max_crop_trial = max_crop_trial - - self._current = 0 - self._size = imdb.num_images - self._index = np.arange(self._size) - - self._data = None - self._label = None - self._get_batch() - - @property - def provide_data(self): - return [(k, v.shape) for k, v in self._data.items()] - - @property - def provide_label(self): - if self.is_train: - return [(k, v.shape) for k, v in self._label.items()] - else: - return [] - - def reset(self): - self._current = 0 - if self._shuffle: - np.random.shuffle(self._index) - - def iter_next(self): - return self._current < self._size - - def next(self): - if self.iter_next(): - self._get_batch() - data_batch = mx.io.DataBatch(data=list(self._data.values()), - label=list(self._label.values()), - pad=self.getpad(), index=self.getindex()) - self._current += self.batch_size - return data_batch - else: - raise StopIteration - - def getindex(self): - return self._current // self.batch_size - - def getpad(self): - pad = self._current + self.batch_size - self._size - return 0 if pad < 0 else pad - - def _get_batch(self): - """ - Load data/label from dataset - """ - batch_data = mx.nd.zeros((self.batch_size, 3, self._data_shape[0], self._data_shape[1])) - batch_label = [] - for i in range(self.batch_size): - if (self._current + i) >= self._size: - if not self.is_train: - continue - # use padding from middle in each epoch - idx = (self._current + i + self._size // 2) % self._size - index = self._index[idx] - else: - index = self._index[self._current + i] - # index = self.debug_index - im_path = self._imdb.image_path_from_index(index) - with open(im_path, 'rb') as fp: - img_content = fp.read() - img = mx.img.imdecode(img_content) - gt = self._imdb.label_from_index(index).copy() if self.is_train else None - data, label = self._data_augmentation(img, gt) - batch_data[i] = data - if self.is_train: - batch_label.append(label) - self._data = {'data': batch_data} - if self.is_train: - self._label = {'label': mx.nd.array(np.array(batch_label))} - else: - self._label = {'label': None} - - def _data_augmentation(self, data, label): - """ - perform data augmentations: crop, mirror, resize, sub mean, swap channels... 
- """ - if self.is_train and self._rand_samplers: - rand_crops = [] - for rs in self._rand_samplers: - rand_crops += rs.sample(label) - num_rand_crops = len(rand_crops) - # randomly pick up one as input data - if num_rand_crops > 0: - index = int(np.random.uniform(0, 1) * num_rand_crops) - width = data.shape[1] - height = data.shape[0] - crop = rand_crops[index][0] - xmin = int(crop[0] * width) - ymin = int(crop[1] * height) - xmax = int(crop[2] * width) - ymax = int(crop[3] * height) - if xmin >= 0 and ymin >= 0 and xmax <= width and ymax <= height: - data = mx.img.fixed_crop(data, xmin, ymin, xmax-xmin, ymax-ymin) - else: - # padding mode - new_width = xmax - xmin - new_height = ymax - ymin - offset_x = 0 - xmin - offset_y = 0 - ymin - data_bak = data - data = mx.nd.full((new_height, new_width, 3), 128, dtype='uint8') - data[offset_y:offset_y+height, offset_x:offset_x + width, :] = data_bak - label = rand_crops[index][1] - if self.is_train: - interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, \ - cv2.INTER_NEAREST, cv2.INTER_LANCZOS4] - else: - interp_methods = [cv2.INTER_LINEAR] - interp_method = interp_methods[int(np.random.uniform(0, 1) * len(interp_methods))] - data = mx.img.imresize(data, self._data_shape[1], self._data_shape[0], interp_method) - if self.is_train and self._rand_mirror: - if np.random.uniform(0, 1) > 0.5: - data = mx.nd.flip(data, axis=1) - valid_mask = np.where(label[:, 0] > -1)[0] - tmp = 1.0 - label[valid_mask, 1] - label[valid_mask, 1] = 1.0 - label[valid_mask, 3] - label[valid_mask, 3] = tmp - data = mx.nd.transpose(data, (2,0,1)) - data = data.astype('float32') - data = data - self._mean_pixels - return data, label diff --git a/example/ssd/dataset/mscoco.py b/example/ssd/dataset/mscoco.py deleted file mode 100644 index dbe6e6909f4d..000000000000 --- a/example/ssd/dataset/mscoco.py +++ /dev/null @@ -1,138 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import os -import numpy as np -from dataset.imdb import Imdb -from dataset.pycocotools.coco import COCO - - -class Coco(Imdb): - """ - Implementation of Imdb for MSCOCO dataset: https://http://mscoco.org - - Parameters: - ---------- - anno_file : str - annotation file for coco, a json file - image_dir : str - image directory for coco images - shuffle : bool - whether initially shuffle image list - - """ - def __init__(self, anno_file, image_dir, shuffle=True, names='mscoco.names'): - assert os.path.isfile(anno_file), "Invalid annotation file: " + anno_file - basename = os.path.splitext(os.path.basename(anno_file))[0] - super(Coco, self).__init__('coco_' + basename) - self.image_dir = image_dir - - self.classes = self._load_class_names(names, - os.path.join(os.path.dirname(__file__), 'names')) - - self.num_classes = len(self.classes) - self._load_all(anno_file, shuffle) - self.num_images = len(self.image_set_index) - - - def image_path_from_index(self, index): - """ - given image index, find out full path - - Parameters: - ---------- - index: int - index of a specific image - Returns: - ---------- - full path of this image - """ - assert self.image_set_index is not None, "Dataset not initialized" - name = self.image_set_index[index] - image_file = os.path.join(self.image_dir, 'images', name) - assert os.path.isfile(image_file), 'Path does not exist: {}'.format(image_file) - return image_file - - def label_from_index(self, index): - """ - given image index, return preprocessed ground-truth - - Parameters: - ---------- - index: int - index of a specific image - Returns: - ---------- - ground-truths of this image - """ - assert self.labels is not None, "Labels not processed" - return self.labels[index] - - def _load_all(self, anno_file, shuffle): - """ - initialize all entries given annotation json file - - Parameters: - ---------- - anno_file: str - annotation json file - shuffle: bool - whether to shuffle image list - """ - image_set_index = [] - labels = [] - coco = COCO(anno_file) - img_ids = coco.getImgIds() - # deal with class names - cats = [cat['name'] for cat in coco.loadCats(coco.getCatIds())] - class_to_coco_ind = dict(zip(cats, coco.getCatIds())) - class_to_ind = dict(zip(self.classes, range(len(self.classes)))) - coco_ind_to_class_ind = dict([(class_to_coco_ind[cls], class_to_ind[cls]) - for cls in self.classes[0:]]) - for img_id in img_ids: - # filename - image_info = coco.loadImgs(img_id)[0] - filename = image_info["file_name"] - subdir = filename.split('_')[1] - height = image_info["height"] - width = image_info["width"] - # label - anno_ids = coco.getAnnIds(imgIds=img_id) - annos = coco.loadAnns(anno_ids) - label = [] - for anno in annos: - cat_id = coco_ind_to_class_ind[anno['category_id']] - bbox = anno["bbox"] - assert len(bbox) == 4 - xmin = float(bbox[0]) / width - ymin = float(bbox[1]) / height - xmax = xmin + float(bbox[2]) / width - ymax = ymin + float(bbox[3]) / height - label.append([cat_id, xmin, ymin, xmax, ymax, 0]) - if label: - labels.append(np.array(label)) - image_set_index.append(os.path.join(subdir, filename)) - - if shuffle: - import random - indices = list(range(len(image_set_index))) - random.shuffle(indices) - image_set_index = [image_set_index[i] for i in indices] - labels = [labels[i] for i in indices] - # store the results - self.image_set_index = image_set_index - self.labels = labels diff --git a/example/ssd/dataset/names/mscoco.names b/example/ssd/dataset/names/mscoco.names deleted file mode 100644 index 941cb4e13922..000000000000 --- 
a/example/ssd/dataset/names/mscoco.names +++ /dev/null @@ -1,80 +0,0 @@ -person -bicycle -car -motorcycle -airplane -bus -train -truck -boat -traffic light -fire hydrant -stop sign -parking meter -bench -bird -cat -dog -horse -sheep -cow -elephant -bear -zebra -giraffe -backpack -umbrella -handbag -tie -suitcase -frisbee -skis -snowboard -sports ball -kite -baseball bat -baseball glove -skateboard -surfboard -tennis racket -bottle -wine glass -cup -fork -knife -spoon -bowl -banana -apple -sandwich -orange -broccoli -carrot -hot dog -pizza -donut -cake -chair -couch -potted plant -bed -dining table -toilet -tv -laptop -mouse -remote -keyboard -cell phone -microwave -oven -toaster -sink -refrigerator -book -clock -vase -scissors -teddy bear -hair drier -toothbrush diff --git a/example/ssd/dataset/names/pascal_voc.names b/example/ssd/dataset/names/pascal_voc.names deleted file mode 100644 index 8420ab35ede7..000000000000 --- a/example/ssd/dataset/names/pascal_voc.names +++ /dev/null @@ -1,20 +0,0 @@ -aeroplane -bicycle -bird -boat -bottle -bus -car -cat -chair -cow -diningtable -dog -horse -motorbike -person -pottedplant -sheep -sofa -train -tvmonitor diff --git a/example/ssd/dataset/pascal_voc.py b/example/ssd/dataset/pascal_voc.py deleted file mode 100644 index 98e217fd15ad..000000000000 --- a/example/ssd/dataset/pascal_voc.py +++ /dev/null @@ -1,286 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from __future__ import print_function -import os -import numpy as np -from dataset.imdb import Imdb -import xml.etree.ElementTree as ET -from evaluate.eval_voc import voc_eval -import cv2 - - -class PascalVoc(Imdb): - """ - Implementation of Imdb for Pascal VOC datasets - - Parameters: - ---------- - image_set : str - set to be used, can be train, val, trainval, test - year : str - year of dataset, can be 2007, 2010, 2012... 
- devkit_path : str - devkit path of VOC dataset - shuffle : boolean - whether to initial shuffle the image list - is_train : boolean - if true, will load annotations - """ - def __init__(self, image_set, year, devkit_path, shuffle=False, is_train=False, - names='pascal_voc.names'): - super(PascalVoc, self).__init__('voc_' + year + '_' + image_set) - self.image_set = image_set - self.year = year - self.devkit_path = devkit_path - self.data_path = os.path.join(devkit_path, 'VOC' + year) - self.extension = '.jpg' - self.is_train = is_train - - self.classes = self._load_class_names(names, - os.path.join(os.path.dirname(__file__), 'names')) - - self.config = {'use_difficult': True, - 'comp_id': 'comp4',} - - self.num_classes = len(self.classes) - self.image_set_index = self._load_image_set_index(shuffle) - self.num_images = len(self.image_set_index) - if self.is_train: - self.labels = self._load_image_labels() - - @property - def cache_path(self): - """ - make a directory to store all caches - - Returns: - --------- - cache path - """ - cache_path = os.path.join(os.path.dirname(__file__), '..', 'cache') - if not os.path.exists(cache_path): - os.mkdir(cache_path) - return cache_path - - def _load_image_set_index(self, shuffle): - """ - find out which indexes correspond to given image set (train or val) - - Parameters: - ---------- - shuffle : boolean - whether to shuffle the image list - Returns: - ---------- - entire list of images specified in the setting - """ - image_set_index_file = os.path.join(self.data_path, 'ImageSets', 'Main', self.image_set + '.txt') - assert os.path.exists(image_set_index_file), 'Path does not exist: {}'.format(image_set_index_file) - with open(image_set_index_file) as f: - image_set_index = [x.strip() for x in f.readlines()] - if shuffle: - np.random.shuffle(image_set_index) - return image_set_index - - def image_path_from_index(self, index): - """ - given image index, find out full path - - Parameters: - ---------- - index: int - index of a specific image - Returns: - ---------- - full path of this image - """ - assert self.image_set_index is not None, "Dataset not initialized" - name = self.image_set_index[index] - image_file = os.path.join(self.data_path, 'JPEGImages', name + self.extension) - assert os.path.exists(image_file), 'Path does not exist: {}'.format(image_file) - return image_file - - def label_from_index(self, index): - """ - given image index, return preprocessed ground-truth - - Parameters: - ---------- - index: int - index of a specific image - Returns: - ---------- - ground-truths of this image - """ - assert self.labels is not None, "Labels not processed" - return self.labels[index] - - def _label_path_from_index(self, index): - """ - given image index, find out annotation path - - Parameters: - ---------- - index: int - index of a specific image - - Returns: - ---------- - full path of annotation file - """ - label_file = os.path.join(self.data_path, 'Annotations', index + '.xml') - assert os.path.exists(label_file), 'Path does not exist: {}'.format(label_file) - return label_file - - def _load_image_labels(self): - """ - preprocess all ground-truths - - Returns: - ---------- - labels packed in [num_images x max_num_objects x 5] tensor - """ - temp = [] - - # load ground-truth from xml annotations - for idx in self.image_set_index: - label_file = self._label_path_from_index(idx) - tree = ET.parse(label_file) - root = tree.getroot() - size = root.find('size') - width = float(size.find('width').text) - height = float(size.find('height').text) - 
label = [] - - for obj in root.iter('object'): - difficult = int(obj.find('difficult').text) - # if not self.config['use_difficult'] and difficult == 1: - # continue - cls_name = obj.find('name').text - if cls_name not in self.classes: - continue - cls_id = self.classes.index(cls_name) - xml_box = obj.find('bndbox') - xmin = float(xml_box.find('xmin').text) / width - ymin = float(xml_box.find('ymin').text) / height - xmax = float(xml_box.find('xmax').text) / width - ymax = float(xml_box.find('ymax').text) / height - label.append([cls_id, xmin, ymin, xmax, ymax, difficult]) - temp.append(np.array(label)) - return temp - - def evaluate_detections(self, detections): - """ - top level evaluations - Parameters: - ---------- - detections: list - result list, each entry is a matrix of detections - Returns: - ---------- - None - """ - # make all these folders for results - result_dir = os.path.join(self.devkit_path, 'results') - if not os.path.exists(result_dir): - os.mkdir(result_dir) - year_folder = os.path.join(self.devkit_path, 'results', 'VOC' + self.year) - if not os.path.exists(year_folder): - os.mkdir(year_folder) - res_file_folder = os.path.join(self.devkit_path, 'results', 'VOC' + self.year, 'Main') - if not os.path.exists(res_file_folder): - os.mkdir(res_file_folder) - - self.write_pascal_results(detections) - self.do_python_eval() - - def get_result_file_template(self): - """ - this is a template - VOCdevkit/results/VOC2007/Main/_det_test_aeroplane.txt - - Returns: - ---------- - a string template - """ - res_file_folder = os.path.join(self.devkit_path, 'results', 'VOC' + self.year, 'Main') - comp_id = self.config['comp_id'] - filename = comp_id + '_det_' + self.image_set + '_{:s}.txt' - path = os.path.join(res_file_folder, filename) - return path - - def write_pascal_results(self, all_boxes): - """ - write results files in pascal devkit path - Parameters: - ---------- - all_boxes: list - boxes to be processed [bbox, confidence] - Returns: - ---------- - None - """ - for cls_ind, cls in enumerate(self.classes): - print('Writing {} VOC results file'.format(cls)) - filename = self.get_result_file_template().format(cls) - with open(filename, 'wt') as f: - for im_ind, index in enumerate(self.image_set_index): - dets = all_boxes[im_ind] - if dets.shape[0] < 1: - continue - h, w = self._get_imsize(self.image_path_from_index(im_ind)) - # the VOCdevkit expects 1-based indices - for k in range(dets.shape[0]): - if (int(dets[k, 0]) == cls_ind): - f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. - format(index, dets[k, 1], - int(dets[k, 2] * w) + 1, int(dets[k, 3] * h) + 1, - int(dets[k, 4] * w) + 1, int(dets[k, 5] * h) + 1)) - - def do_python_eval(self): - """ - python evaluation wrapper - - Returns: - ---------- - None - """ - annopath = os.path.join(self.data_path, 'Annotations', '{:s}.xml') - imageset_file = os.path.join(self.data_path, 'ImageSets', 'Main', self.image_set + '.txt') - cache_dir = os.path.join(self.cache_path, self.name) - aps = [] - # The PASCAL VOC metric changed in 2010 - use_07_metric = True if int(self.year) < 2010 else False - print('VOC07 metric? 
' + ('Y' if use_07_metric else 'No')) - for cls_ind, cls in enumerate(self.classes): - filename = self.get_result_file_template().format(cls) - rec, prec, ap = voc_eval(filename, annopath, imageset_file, cls, cache_dir, - ovthresh=0.5, use_07_metric=use_07_metric) - aps += [ap] - print('AP for {} = {:.4f}'.format(cls, ap)) - print('Mean AP = {:.4f}'.format(np.mean(aps))) - - def _get_imsize(self, im_name): - """ - get image size info - Returns: - ---------- - tuple of (height, width) - """ - img = cv2.imread(im_name) - return (img.shape[0], img.shape[1]) diff --git a/example/ssd/dataset/pycocotools/README.md b/example/ssd/dataset/pycocotools/README.md deleted file mode 100644 index ed4411425f8c..000000000000 --- a/example/ssd/dataset/pycocotools/README.md +++ /dev/null @@ -1,19 +0,0 @@ - - - - - - - - - - - - - - - - - -This is a modified version of https://github.com/pdollar/coco python API. -No `make` is required, but this will not support mask functions. diff --git a/example/ssd/dataset/pycocotools/__init__.py b/example/ssd/dataset/pycocotools/__init__.py deleted file mode 100644 index 2f4e0d430df9..000000000000 --- a/example/ssd/dataset/pycocotools/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -__author__ = 'tylin' diff --git a/example/ssd/dataset/pycocotools/coco.py b/example/ssd/dataset/pycocotools/coco.py deleted file mode 100644 index 470f086f0b02..000000000000 --- a/example/ssd/dataset/pycocotools/coco.py +++ /dev/null @@ -1,418 +0,0 @@ -__author__ = 'tylin' -__version__ = '2.0' -# Interface for accessing the Microsoft COCO dataset. - -# Microsoft COCO is a large image dataset designed for object detection, -# segmentation, and caption generation. pycocotools is a Python API that -# assists in loading, parsing and visualizing the annotations in COCO. -# Please visit http://mscoco.org/ for more information on COCO, including -# for the data, paper, and tutorials. The exact format of the annotations -# is also described on the COCO website. For example usage of the pycocotools -# please see pycocotools_demo.ipynb. In addition to this API, please download both -# the COCO images and annotations in order to run the demo. - -# An alternative to using the API is to load the annotations directly -# into Python dictionary -# Using the API provides additional utility functions. Note that this API -# supports both *instance* and *caption* annotations. In the case of -# captions not all functions are defined (e.g. categories are undefined). - -# The following API functions are defined: -# COCO - COCO api class that loads COCO annotation file and prepare data structures. -# decodeMask - Decode binary mask M encoded via run-length encoding. -# encodeMask - Encode binary mask M using run-length encoding. 
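# A minimal NumPy sketch of the two PASCAL VOC average-precision conventions selected by
# use_07_metric in do_python_eval above (the full computation lives in evaluate/eval_voc.py,
# imported by pascal_voc.py as voc_eval). Here rec and prec are the cumulative recall and
# precision arrays for one class, ordered by descending detection score.
import numpy as np

def voc_ap(rec, prec, use_07_metric=False):
    if use_07_metric:
        # VOC2007: mean of the maximum precision at 11 equally spaced recall levels 0.0 .. 1.0
        ap = 0.0
        for t in np.arange(0.0, 1.1, 0.1):
            p = np.max(prec[rec >= t]) if np.any(rec >= t) else 0.0
            ap += p / 11.0
        return ap
    # VOC2010+: area under the monotonically decreasing precision envelope
    mrec = np.concatenate(([0.0], rec, [1.0]))
    mpre = np.concatenate(([0.0], prec, [0.0]))
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
    changed = np.where(mrec[1:] != mrec[:-1])[0]
    return float(np.sum((mrec[changed + 1] - mrec[changed]) * mpre[changed + 1]))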
-# getAnnIds - Get ann ids that satisfy given filter conditions. -# getCatIds - Get cat ids that satisfy given filter conditions. -# getImgIds - Get img ids that satisfy given filter conditions. -# loadAnns - Load anns with the specified ids. -# loadCats - Load cats with the specified ids. -# loadImgs - Load imgs with the specified ids. -# annToMask - Convert segmentation in an annotation to binary mask. -# showAnns - Display the specified annotations. -# loadRes - Load algorithm results and create API for accessing them. -# download - Download COCO images from mscoco.org server. -# Throughout the API "ann"=annotation, "cat"=category, and "img"=image. -# Help on each functions can be accessed by: "help COCO>function". - -# See also COCO>decodeMask, -# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, -# COCO>getImgIds, COCO>loadAnns, COCO>loadCats, -# COCO>loadImgs, COCO>annToMask, COCO>showAnns - -# Microsoft COCO Toolbox. version 2.0 -# Data, paper, and tutorials available at: http://mscoco.org/ -# Code written by Piotr Dollar and Tsung-Yi Lin, 2014. -# Licensed under the Simplified BSD License [see bsd.txt] - -import json -import time -import matplotlib.pyplot as plt -from matplotlib.collections import PatchCollection -from matplotlib.patches import Polygon -import numpy as np -import copy -import itertools -# from . import mask as maskUtils -import os -from collections import defaultdict -from mxnet.base import string_types -try: - from urllib.request import urlretrieve -except ImportError: - from urllib import urlretrieve - - -class COCO: - def __init__(self, annotation_file=None): - """ - Constructor of Microsoft COCO helper class for reading and visualizing annotations. - :param annotation_file (str): location of annotation file - :param image_folder (str): location to the folder that hosts images. - :return: - """ - # load dataset - self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() - self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) - if not annotation_file == None: - print('loading annotations into memory...') - tic = time.time() - dataset = json.load(open(annotation_file, 'r')) - assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) - print('Done (t={:0.2f}s)'.format(time.time()- tic)) - self.dataset = dataset - self.createIndex() - - def createIndex(self): - # create index - print('creating index...') - anns, cats, imgs = {}, {}, {} - imgToAnns,catToImgs = defaultdict(list),defaultdict(list) - if 'annotations' in self.dataset: - for ann in self.dataset['annotations']: - imgToAnns[ann['image_id']].append(ann) - anns[ann['id']] = ann - - if 'images' in self.dataset: - for img in self.dataset['images']: - imgs[img['id']] = img - - if 'categories' in self.dataset: - for cat in self.dataset['categories']: - cats[cat['id']] = cat - - if 'annotations' in self.dataset and 'categories' in self.dataset: - for ann in self.dataset['annotations']: - catToImgs[ann['category_id']].append(ann['image_id']) - - print('index created!') - - # create class members - self.anns = anns - self.imgToAnns = imgToAnns - self.catToImgs = catToImgs - self.imgs = imgs - self.cats = cats - - def info(self): - """ - Print information about the annotation file. - :return: - """ - for key, value in self.dataset['info'].items(): - print('{}: {}'.format(key, value)) - - def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): - """ - Get ann ids that satisfy given filter conditions. 
default skips that filter - :param imgIds (int array) : get anns for given imgs - catIds (int array) : get anns for given cats - areaRng (float array) : get anns for given area range (e.g. [0 inf]) - iscrowd (boolean) : get anns for given crowd label (False or True) - :return: ids (int array) : integer array of ann ids - """ - imgIds = imgIds if type(imgIds) == list else [imgIds] - catIds = catIds if type(catIds) == list else [catIds] - - if len(imgIds) == len(catIds) == len(areaRng) == 0: - anns = self.dataset['annotations'] - else: - if not len(imgIds) == 0: - lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] - anns = list(itertools.chain.from_iterable(lists)) - else: - anns = self.dataset['annotations'] - anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] - anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] - if not iscrowd == None: - ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] - else: - ids = [ann['id'] for ann in anns] - return ids - - def getCatIds(self, catNms=[], supNms=[], catIds=[]): - """ - filtering parameters. default skips that filter. - :param catNms (str array) : get cats for given cat names - :param supNms (str array) : get cats for given supercategory names - :param catIds (int array) : get cats for given cat ids - :return: ids (int array) : integer array of cat ids - """ - catNms = catNms if type(catNms) == list else [catNms] - supNms = supNms if type(supNms) == list else [supNms] - catIds = catIds if type(catIds) == list else [catIds] - - if len(catNms) == len(supNms) == len(catIds) == 0: - cats = self.dataset['categories'] - else: - cats = self.dataset['categories'] - cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] - cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] - cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] - ids = [cat['id'] for cat in cats] - return ids - - def getImgIds(self, imgIds=[], catIds=[]): - ''' - Get img ids that satisfy given filter conditions. - :param imgIds (int array) : get imgs for given ids - :param catIds (int array) : get imgs with all given cats - :return: ids (int array) : integer array of img ids - ''' - imgIds = imgIds if type(imgIds) == list else [imgIds] - catIds = catIds if type(catIds) == list else [catIds] - - if len(imgIds) == len(catIds) == 0: - ids = self.imgs.keys() - else: - ids = set(imgIds) - for i, catId in enumerate(catIds): - if i == 0 and len(ids) == 0: - ids = set(self.catToImgs[catId]) - else: - ids &= set(self.catToImgs[catId]) - return list(ids) - - def loadAnns(self, ids=[]): - """ - Load anns with the specified ids. - :param ids (int array) : integer ids specifying anns - :return: anns (object array) : loaded ann objects - """ - if type(ids) == list: - return [self.anns[id] for id in ids] - elif type(ids) == int: - return [self.anns[ids]] - - def loadCats(self, ids=[]): - """ - Load cats with the specified ids. - :param ids (int array) : integer ids specifying cats - :return: cats (object array) : loaded cat objects - """ - if type(ids) == list: - return [self.cats[id] for id in ids] - elif type(ids) == int: - return [self.cats[ids]] - - def loadImgs(self, ids=[]): - """ - Load anns with the specified ids. 
- :param ids (int array) : integer ids specifying img - :return: imgs (object array) : loaded img objects - """ - if type(ids) == list: - return [self.imgs[id] for id in ids] - elif type(ids) == int: - return [self.imgs[ids]] - - def showAnns(self, anns): - """ - Display the specified annotations. - :param anns (array of object): annotations to display - :return: None - """ - if len(anns) == 0: - return 0 - if 'segmentation' in anns[0] or 'keypoints' in anns[0]: - datasetType = 'instances' - elif 'caption' in anns[0]: - datasetType = 'captions' - else: - raise Exception('datasetType not supported') - if datasetType == 'instances': - ax = plt.gca() - ax.set_autoscale_on(False) - polygons = [] - color = [] - for ann in anns: - c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] - if 'segmentation' in ann: - if type(ann['segmentation']) == list: - # polygon - for seg in ann['segmentation']: - poly = np.array(seg).reshape((int(len(seg)/2), 2)) - polygons.append(Polygon(poly)) - color.append(c) - else: - # mask - raise NotImplementedError("maskUtils disabled!") - if 'keypoints' in ann and type(ann['keypoints']) == list: - # turn skeleton into zero-based index - sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 - kp = np.array(ann['keypoints']) - x = kp[0::3] - y = kp[1::3] - v = kp[2::3] - for sk in sks: - if np.all(v[sk]>0): - plt.plot(x[sk],y[sk], linewidth=3, color=c) - plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) - plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) - p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) - ax.add_collection(p) - p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) - ax.add_collection(p) - elif datasetType == 'captions': - for ann in anns: - print(ann['caption']) - - def loadRes(self, resFile): - """ - Load result file and return a result api object. 
- :param resFile (str) : file name of result file - :return: res (obj) : result api object - """ - res = COCO() - res.dataset['images'] = [img for img in self.dataset['images']] - - print('Loading and preparing results...') - tic = time.time() - if type(resFile) in string_types: - anns = json.load(open(resFile)) - elif type(resFile) == np.ndarray: - anns = self.loadNumpyAnnotations(resFile) - else: - anns = resFile - assert type(anns) == list, 'results in not an array of objects' - annsImgIds = [ann['image_id'] for ann in anns] - assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ - 'Results do not correspond to current coco set' - if 'caption' in anns[0]: - imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) - res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] - for id, ann in enumerate(anns): - ann['id'] = id+1 - elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) - for id, ann in enumerate(anns): - bb = ann['bbox'] - x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] - if not 'segmentation' in ann: - ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] - ann['area'] = bb[2]*bb[3] - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'segmentation' in anns[0]: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) - for id, ann in enumerate(anns): - # now only support compressed RLE format as segmentation results - # ann['area'] = maskUtils.area(ann['segmentation']) - raise NotImplementedError("maskUtils disabled!") - if not 'bbox' in ann: - # ann['bbox'] = maskUtils.toBbox(ann['segmentation']) - raise NotImplementedError("maskUtils disabled!") - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'keypoints' in anns[0]: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) - for id, ann in enumerate(anns): - s = ann['keypoints'] - x = s[0::3] - y = s[1::3] - x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) - ann['area'] = (x1-x0)*(y1-y0) - ann['id'] = id + 1 - ann['bbox'] = [x0,y0,x1-x0,y1-y0] - print('DONE (t={:0.2f}s)'.format(time.time()- tic)) - - res.dataset['annotations'] = anns - res.createIndex() - return res - - def download(self, tarDir = None, imgIds = [] ): - ''' - Download COCO images from mscoco.org server. 
- :param tarDir (str): COCO results directory name - imgIds (list): images to be downloaded - :return: - ''' - if tarDir is None: - print('Please specify target directory') - return -1 - if len(imgIds) == 0: - imgs = self.imgs.values() - else: - imgs = self.loadImgs(imgIds) - N = len(imgs) - if not os.path.exists(tarDir): - os.makedirs(tarDir) - for i, img in enumerate(imgs): - tic = time.time() - fname = os.path.join(tarDir, img['file_name']) - if not os.path.exists(fname): - urlretrieve(img['coco_url'], fname) - print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) - - def loadNumpyAnnotations(self, data): - """ - Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} - :param data (numpy.ndarray) - :return: annotations (python nested list) - """ - print('Converting ndarray to lists...') - assert(type(data) == np.ndarray) - print(data.shape) - assert(data.shape[1] == 7) - N = data.shape[0] - ann = [] - for i in range(N): - if i % 1000000 == 0: - print('{}/{}'.format(i,N)) - ann += [{ - 'image_id' : int(data[i, 0]), - 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], - 'score' : data[i, 5], - 'category_id': int(data[i, 6]), - }] - return ann - - def annToRLE(self, ann): - """ - Convert annotation which can be polygons, uncompressed RLE to RLE. - :return: binary mask (numpy 2D array) - """ - t = self.imgs[ann['image_id']] - h, w = t['height'], t['width'] - segm = ann['segmentation'] - if type(segm) == list: - # polygon -- a single object might consist of multiple parts - # we merge all parts into one mask rle code - # rles = maskUtils.frPyObjects(segm, h, w) - # rle = maskUtils.merge(rles) - raise NotImplementedError("maskUtils disabled!") - elif type(segm['counts']) == list: - # uncompressed RLE - # rle = maskUtils.frPyObjects(segm, h, w) - raise NotImplementedError("maskUtils disabled!") - else: - # rle - rle = ann['segmentation'] - return rle - - def annToMask(self, ann): - """ - Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. - :return: binary mask (numpy 2D array) - """ - rle = self.annToRLE(ann) - raise NotImplementedError("maskUtils disabled!") diff --git a/example/ssd/dataset/testdb.py b/example/ssd/dataset/testdb.py deleted file mode 100644 index 2f982a4b303b..000000000000 --- a/example/ssd/dataset/testdb.py +++ /dev/null @@ -1,69 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import os -from .imdb import Imdb - - -class TestDB(Imdb): - """ - A simple wrapper class for converting list of image to Imdb during testing - - Parameters: - ---------- - images : str or list of str - image path or list of images, if directory and extension not - specified, root_dir and extension are required - root_dir : str or None - directory of input images, optional if image path already - has full directory information - extension : str or None - image extension, eg. ".jpg", optional - """ - def __init__(self, images, root_dir=None, extension=None): - if not isinstance(images, list): - images = [images] - num_images = len(images) - super(TestDB, self).__init__("test" + str(num_images)) - self.image_set_index = images - self.num_images = num_images - self.root_dir = root_dir if root_dir else None - self.extension = extension if extension else None - - - def image_path_from_index(self, index): - """ - given image index, return full path - - Parameters: - ---------- - index: int - index of a specific image - Returns - ---------- - path of this image - """ - name = self.image_set_index[index] - if self.extension: - name += self.extension - if self.root_dir: - name = os.path.join(self.root_dir, name) - assert os.path.exists(name), 'Path does not exist: {}'.format(name) - return name - - def label_from_index(self, index): - return RuntimeError("Testdb does not support label loading") diff --git a/example/ssd/dataset/yolo_format.py b/example/ssd/dataset/yolo_format.py deleted file mode 100644 index f1b73d032293..000000000000 --- a/example/ssd/dataset/yolo_format.py +++ /dev/null @@ -1,170 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import os -import numpy as np -from imdb import Imdb - - -class YoloFormat(Imdb): - """ - Base class for loading datasets as used in YOLO - - Parameters: - ---------- - name : str - name for this dataset - classes : list or tuple of str - class names in this dataset - list_file : str - filename of the image list file - image_dir : str - image directory - label_dir : str - label directory - extension : str - by default .jpg - label_extension : str - by default .txt - shuffle : bool - whether to shuffle the initial order when loading this dataset, - default is True - """ - def __init__(self, name, classes, list_file, image_dir, label_dir, \ - extension='.jpg', label_extension='.txt', shuffle=True): - if isinstance(classes, list) or isinstance(classes, tuple): - num_classes = len(classes) - elif isinstance(classes, str): - with open(classes, 'r') as f: - classes = [l.strip() for l in f.readlines()] - num_classes = len(classes) - else: - raise ValueError("classes should be list/tuple or text file") - assert num_classes > 0, "number of classes must > 0" - super(YoloFormat, self).__init__(name + '_' + str(num_classes)) - self.classes = classes - self.num_classes = num_classes - self.list_file = list_file - self.image_dir = image_dir - self.label_dir = label_dir - self.extension = extension - self.label_extension = label_extension - - self.image_set_index = self._load_image_set_index(shuffle) - self.num_images = len(self.image_set_index) - self.labels = self._load_image_labels() - - - def _load_image_set_index(self, shuffle): - """ - find out which indexes correspond to given image set (train or val) - - Parameters: - ---------- - shuffle : boolean - whether to shuffle the image list - Returns: - ---------- - entire list of images specified in the setting - """ - assert os.path.exists(self.list_file), 'Path does not exists: {}'.format(self.list_file) - with open(self.list_file, 'r') as f: - image_set_index = [x.strip() for x in f.readlines()] - if shuffle: - np.random.shuffle(image_set_index) - return image_set_index - - def image_path_from_index(self, index): - """ - given image index, find out full path - - Parameters: - ---------- - index: int - index of a specific image - Returns: - ---------- - full path of this image - """ - assert self.image_set_index is not None, "Dataset not initialized" - name = self.image_set_index[index] - image_file = os.path.join(self.image_dir, name) + self.extension - assert os.path.exists(image_file), 'Path does not exist: {}'.format(image_file) - return image_file - - def label_from_index(self, index): - """ - given image index, return preprocessed ground-truth - - Parameters: - ---------- - index: int - index of a specific image - Returns: - ---------- - ground-truths of this image - """ - assert self.labels is not None, "Labels not processed" - return self.labels[index] - - def _label_path_from_index(self, index): - """ - given image index, find out annotation path - - Parameters: - ---------- - index: int - index of a specific image - - Returns: - ---------- - full path of annotation file - """ - label_file = os.path.join(self.label_dir, index + self.label_extension) - assert os.path.exists(label_file), 'Path does not exist: {}'.format(label_file) - return label_file - - def _load_image_labels(self): - """ - preprocess all ground-truths - - Returns: - ---------- - labels packed in [num_images x max_num_objects x 5] tensor - """ - temp = [] - - # load ground-truths - for idx in self.image_set_index: - label_file = self._label_path_from_index(idx) - with 
open(label_file, 'r') as f: - label = [] - for line in f.readlines(): - temp_label = line.strip().split() - assert len(temp_label) == 5, "Invalid label file" + label_file - cls_id = int(temp_label[0]) - x = float(temp_label[1]) - y = float(temp_label[2]) - half_width = float(temp_label[3]) / 2 - half_height = float(temp_label[4]) / 2 - xmin = x - half_width - ymin = y - half_height - xmax = x + half_width - ymax = y + half_height - label.append([cls_id, xmin, ymin, xmax, ymax]) - temp.append(np.array(label)) - return temp diff --git a/example/ssd/demo.py b/example/ssd/demo.py deleted file mode 100644 index e8194ab8ead3..000000000000 --- a/example/ssd/demo.py +++ /dev/null @@ -1,241 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import argparse -import tools.find_mxnet -import mxnet as mx -import os -import sys -from detect.detector import Detector -from symbol.symbol_factory import get_symbol -from dataset.cv2Iterator import CameraIterator -import logging -import cv2 - -def get_detector(net, prefix, epoch, data_shape, mean_pixels, ctx, num_class, - nms_thresh=0.5, force_nms=True, nms_topk=400): - """ - wrapper for initialize a detector - - Parameters: - ---------- - net : str - test network name - prefix : str - load model prefix - epoch : int - load model epoch - data_shape : int - resize image shape - mean_pixels : tuple (float, float, float) - mean pixel values (R, G, B) - ctx : mx.ctx - running context, mx.cpu() or mx.gpu(?) 
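# A minimal sketch of the per-line parsing done in YoloFormat._load_image_labels above:
# each YOLO label line is "cls_id x_center y_center width height", all coordinates normalized
# to [0, 1], and the loader converts it to corner form [cls_id, xmin, ymin, xmax, ymax].

def yolo_line_to_corners(line):
    fields = line.strip().split()
    assert len(fields) == 5, "Invalid label line: " + line
    cls_id = int(fields[0])
    x, y, w, h = (float(v) for v in fields[1:])
    return [cls_id, x - w / 2.0, y - h / 2.0, x + w / 2.0, y + h / 2.0]

# Example: a box centered at (0.5, 0.5) covering 40% x 20% of the image.
print(yolo_line_to_corners("7 0.5 0.5 0.4 0.2"))
# -> [7, 0.3, 0.4, 0.7, 0.6]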
- num_class : int - number of classes - nms_thresh : float - non-maximum suppression threshold - force_nms : bool - force suppress different categories - """ - if net is not None: - if isinstance(data_shape, tuple): - data_shape = data_shape[0] - net = get_symbol(net, data_shape, num_classes=num_class, nms_thresh=nms_thresh, - force_nms=force_nms, nms_topk=nms_topk) - detector = Detector(net, prefix, epoch, data_shape, mean_pixels, ctx=ctx) - return detector - -def parse_args(): - parser = argparse.ArgumentParser(description='Single-shot detection network demo') - parser.add_argument('--network', dest='network', type=str, default='resnet50', - help='which network to use') - parser.add_argument('--images', dest='images', type=str, default='./data/demo/dog.jpg', - help='run demo with images, use comma to seperate multiple images') - parser.add_argument('--dir', dest='dir', nargs='?', - help='demo image directory, optional', type=str) - parser.add_argument('--ext', dest='extension', help='image extension, optional', - type=str, nargs='?') - parser.add_argument('--epoch', dest='epoch', help='epoch of trained model', - default=0, type=int) - parser.add_argument('--batch-size', dest='batch_size', help='batch size', - default=1, type=int) - parser.add_argument('--prefix', dest='prefix', help='trained model prefix', - default=os.path.join(os.getcwd(), 'model', 'ssd_'), - type=str) - parser.add_argument('--cpu', dest='cpu', help='(override GPU) use CPU to detect', - action='store_true', default=False) - parser.add_argument('--gpu', dest='gpu_id', type=int, default=0, - help='GPU device id to detect with') - parser.add_argument('--data-shape', dest='data_shape', type=str, default='512', - help='set image shape') - parser.add_argument('--mean-r', dest='mean_r', type=float, default=123, - help='red mean value') - parser.add_argument('--mean-g', dest='mean_g', type=float, default=117, - help='green mean value') - parser.add_argument('--mean-b', dest='mean_b', type=float, default=104, - help='blue mean value') - parser.add_argument('--thresh', dest='thresh', type=float, default=0.5, - help='object visualize score threshold, default 0.6') - parser.add_argument('--nms', dest='nms_thresh', type=float, default=0.5, - help='non-maximum suppression threshold, default 0.5') - parser.add_argument('--no-force', dest='force_nms', action='store_false', - help='dont force non-maximum suppression on different class') - parser.add_argument('--no-timer', dest='show_timer', action='store_false', - help='dont show detection time') - parser.add_argument('--deploy', dest='deploy_net', action='store_true', default=False, - help='Load network from json file, rather than from symbol') - parser.add_argument('--class-names', dest='class_names', type=str, - default='aeroplane, bicycle, bird, boat, bottle, bus, \ - car, cat, chair, cow, diningtable, dog, horse, motorbike, \ - person, pottedplant, sheep, sofa, train, tvmonitor', - help='string of comma separated names, or text filename') - parser.add_argument('--camera', action='store_true', - help="use camera for image capturing") - parser.add_argument('--frame-resize', type=str, default=None, - help="resize camera frame to x,y pixels or a float scaling factor") - args = parser.parse_args() - return args - -def parse_class_names(class_names): - """ parse # classes and class_names if applicable """ - if len(class_names) > 0: - if os.path.isfile(class_names): - # try to open it to read class names - with open(class_names, 'r') as f: - class_names = [l.strip() for l in 
f.readlines()] - else: - class_names = [c.strip() for c in class_names.split(',')] - for name in class_names: - assert len(name) > 0 - else: - raise RuntimeError("No valid class_name provided...") - return class_names - -def parse_frame_resize(x): - if not x: - return x - x = list(map(float, x.strip().split(','))) - assert len(x) >= 1 and len(x) <= 2, "frame_resize should be a float scaling factor or a tuple of w,h pixels" - if len(x) == 1: - x = x[0] - return x - -def parse_data_shape(data_shape_str): - """Parse string to tuple or int""" - ds = data_shape_str.strip().split(',') - if len(ds) == 1: - data_shape = (int(ds[0]), int(ds[0])) - elif len(ds) == 2: - data_shape = (int(ds[0]), int(ds[1])) - else: - raise ValueError("Unexpected data_shape: %s", data_shape_str) - return data_shape - -def draw_detection(frame, det, class_names): - (klass, score, x0, y0, x1, y1) = det - klass_name = class_names[int(klass)] - h = frame.shape[0] - w = frame.shape[1] - # denormalize detections from [0,1] to the frame size - p0 = tuple(map(int, (x0*w,y0*h))) - p1 = tuple(map(int, (x1*w,y1*h))) - logging.info("detection: %s %s", klass_name, score) - cv2.rectangle(frame, p0, p1, (0,0,255), 2) - # Where to draw the text, a few pixels above the top y coordinate - tp0 = (p0[0], p0[1]-5) - draw_text = "{} {}".format(klass_name, score) - cv2.putText(frame, draw_text, tp0, cv2.FONT_HERSHEY_COMPLEX_SMALL, 0.5, (0,0,255)) - - -def network_path(prefix, network, data_shape): - return "{}{}_{}".format(prefix, network, data_shape) - -def run_camera(args,ctx): - assert args.batch_size == 1, "only batch size of 1 is supported" - logging.info("Detection threshold is {}".format(args.thresh)) - iter = CameraIterator(frame_resize=parse_frame_resize(args.frame_resize)) - class_names = parse_class_names(args.class_names) - mean_pixels = (args.mean_r, args.mean_g, args.mean_b) - data_shape = int(args.data_shape) - batch_size = int(args.batch_size) - detector = Detector( - get_symbol(args.network, data_shape, num_classes=len(class_names)), - network_path(args.prefix, args.network, data_shape), - args.epoch, - data_shape, - mean_pixels, - batch_size, - ctx - ) - for frame in iter: - logging.info("Frame info: shape %s type %s", frame.shape, frame.dtype) - logging.info("Generating batch") - data_batch = detector.create_batch(frame) - logging.info("Detecting objects") - detections_batch = detector.detect_batch(data_batch) - #detections = [mx.nd.array((1,1,0.2,0.2,0.4,0.4))] - detections = detections_batch[0] - logging.info("%d detections", len(detections)) - for det in detections: - obj = det.asnumpy() - (klass, score, x0, y0, x1, y1) = obj - if score > args.thresh: - draw_detection(frame, obj, class_names) - cv2.imshow('frame', frame) - -def run_images(args,ctx): - # parse image list - image_list = [i.strip() for i in args.images.split(',')] - assert len(image_list) > 0, "No valid image specified to detect" - - network = None if args.deploy_net else args.network - class_names = parse_class_names(args.class_names) - data_shape = parse_data_shape(args.data_shape) - if args.prefix.endswith('_'): - prefix = args.prefix + args.network + '_' + str(data_shape[0]) - else: - prefix = args.prefix - detector = get_detector(network, prefix, args.epoch, - data_shape, - (args.mean_r, args.mean_g, args.mean_b), - ctx, len(class_names), args.nms_thresh, args.force_nms) - # run detection - detector.detect_and_visualize(image_list, args.dir, args.extension, - class_names, args.thresh, args.show_timer) - -def main(): - 
logging.getLogger().setLevel(logging.INFO) - logging.basicConfig(format='%(asctime)-15s %(message)s') - args = parse_args() - if args.cpu: - ctx = mx.cpu() - else: - ctx = mx.gpu(args.gpu_id) - - if args.camera: - run_camera(args, ctx) - else: - run_images(args, ctx) - return 0 - -if __name__ == '__main__': - sys.exit(main()) - diff --git a/example/ssd/deploy.py b/example/ssd/deploy.py deleted file mode 100644 index 5c435f4939e5..000000000000 --- a/example/ssd/deploy.py +++ /dev/null @@ -1,63 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from __future__ import print_function -import argparse -import tools.find_mxnet -import mxnet as mx -import os -import importlib -import sys -from symbol.symbol_factory import get_symbol - -def parse_args(): - parser = argparse.ArgumentParser(description='Convert a trained model to deploy model') - parser.add_argument('--network', dest='network', type=str, default='vgg16_reduced', - help='which network to use') - parser.add_argument('--epoch', dest='epoch', help='epoch of trained model', - default=0, type=int) - parser.add_argument('--prefix', dest='prefix', help='trained model prefix', - default=os.path.join(os.getcwd(), 'model', 'ssd_'), type=str) - parser.add_argument('--data-shape', dest='data_shape', type=int, default=300, - help='data shape') - parser.add_argument('--num-class', dest='num_classes', help='number of classes', - default=20, type=int) - parser.add_argument('--nms', dest='nms_thresh', type=float, default=0.5, - help='non-maximum suppression threshold, default 0.5') - parser.add_argument('--no-force', dest='force_nms', action='store_false', - help='dont force non-maximum suppression on different class') - parser.add_argument('--topk', dest='nms_topk', type=int, default=400, - help='apply nms only to top k detections based on scores.') - args = parser.parse_args() - return args - -if __name__ == '__main__': - args = parse_args() - net = get_symbol(args.network, args.data_shape, - num_classes=args.num_classes, nms_thresh=args.nms_thresh, - force_suppress=args.force_nms, nms_topk=args.nms_topk) - if args.prefix.endswith('_'): - prefix = args.prefix + args.network + '_' + str(args.data_shape) - else: - prefix = args.prefix - _, arg_params, aux_params = mx.model.load_checkpoint(prefix, args.epoch) - # new name - tmp = prefix.rsplit('/', 1) - save_prefix = '/deploy_'.join(tmp) - mx.model.save_checkpoint(save_prefix, args.epoch, net, arg_params, aux_params) - print("Saved model: {}-{:04d}.params".format(save_prefix, args.epoch)) - print("Saved symbol: {}-symbol.json".format(save_prefix)) diff --git a/example/ssd/detect/__init__.py b/example/ssd/detect/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/ssd/detect/detector.py b/example/ssd/detect/detector.py 
deleted file mode 100644 index 1b5e8cb76ee2..000000000000 --- a/example/ssd/detect/detector.py +++ /dev/null @@ -1,238 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -import numpy as np -from timeit import default_timer as timer -from dataset.testdb import TestDB -from dataset.iterator import DetIter -import logging -import cv2 -from mxnet.io import DataBatch, DataDesc - - -class Detector(object): - """ - SSD detector which hold a detection network and wraps detection API - - Parameters: - ---------- - symbol : mx.Symbol - detection network Symbol - model_prefix : str - name prefix of trained model - epoch : int - load epoch of trained model - data_shape : int - input data resize shape - mean_pixels : tuple of float - (mean_r, mean_g, mean_b) - batch_size : int - run detection with batch size - ctx : mx.ctx - device to use, if None, use mx.cpu() as default context - """ - def __init__(self, symbol, model_prefix, epoch, data_shape, mean_pixels, \ - batch_size=1, ctx=None): - self.ctx = ctx - if self.ctx is None: - self.ctx = mx.cpu() - load_symbol, args, auxs = mx.model.load_checkpoint(model_prefix, epoch) - if symbol is None: - symbol = load_symbol - self.mod = mx.mod.Module(symbol, label_names=None, context=self.ctx) - if not isinstance(data_shape, tuple): - data_shape = (data_shape, data_shape) - self.data_shape = data_shape - self.mod.bind(data_shapes=[('data', (batch_size, 3, data_shape[0], data_shape[1]))]) - self.mod.set_params(args, auxs) - self.mean_pixels = mean_pixels - self.mean_pixels_nd = mx.nd.array(mean_pixels).reshape((3,1,1)) - - def create_batch(self, frame): - """ - :param frame: an (w,h,channels) numpy array (image) - :return: DataBatch of (1,channels,data_shape,data_shape) - """ - frame_resize = mx.nd.array(cv2.resize(frame, (self.data_shape[0], self.data_shape[1]))) - #frame_resize = mx.img.imresize(frame, self.data_shape[0], self.data_shape[1], cv2.INTER_LINEAR) - # Change dimensions from (w,h,channels) to (channels, w, h) - frame_t = mx.nd.transpose(frame_resize, axes=(2,0,1)) - frame_norm = frame_t - self.mean_pixels_nd - # Add dimension for batch, results in (1,channels,w,h) - batch_frame = [mx.nd.expand_dims(frame_norm, axis=0)] - batch_shape = [DataDesc('data', batch_frame[0].shape)] - batch = DataBatch(data=batch_frame, provide_data=batch_shape) - return batch - - def detect_iter(self, det_iter, show_timer=False): - """ - detect all images in iterator - - Parameters: - ---------- - det_iter : DetIter - iterator for all testing images - show_timer : Boolean - whether to print out detection exec time - - Returns: - ---------- - list of detection results - """ - num_images = det_iter._size - if not isinstance(det_iter, mx.io.PrefetchingIter): - det_iter = mx.io.PrefetchingIter(det_iter) - start = 
timer() - detections = self.mod.predict(det_iter).asnumpy() - time_elapsed = timer() - start - if show_timer: - logging.info("Detection time for {} images: {:.4f} sec".format( - num_images, time_elapsed)) - result = Detector.filter_positive_detections(detections) - return result - - def detect_batch(self, batch): - """ - Return detections for batch - :param batch: - :return: - """ - self.mod.forward(batch, is_train=False) - detections = self.mod.get_outputs()[0] - positive_detections = Detector.filter_positive_detections(detections) - return positive_detections - - def im_detect(self, im_list, root_dir=None, extension=None, show_timer=False): - """ - wrapper for detecting multiple images - - Parameters: - ---------- - im_list : list of str - image path or list of image paths - root_dir : str - directory of input images, optional if image path already - has full directory information - extension : str - image extension, eg. ".jpg", optional - - Returns: - ---------- - list of detection results in format [det0, det1...], det is in - format np.array([id, score, xmin, ymin, xmax, ymax]...) - """ - test_db = TestDB(im_list, root_dir=root_dir, extension=extension) - test_iter = DetIter(test_db, 1, self.data_shape, self.mean_pixels, - is_train=False) - return self.detect_iter(test_iter, show_timer) - - def visualize_detection(self, img, dets, classes=[], thresh=0.6): - """ - visualize detections in one image - - Parameters: - ---------- - img : numpy.array - image, in bgr format - dets : numpy.array - ssd detections, numpy.array([[id, score, x1, y1, x2, y2]...]) - each row is one object - classes : tuple or list of str - class names - thresh : float - score threshold - """ - import matplotlib.pyplot as plt - import random - plt.imshow(img) - height = img.shape[0] - width = img.shape[1] - colors = dict() - for det in dets: - (klass, score, x0, y0, x1, y1) = det - if score < thresh: - continue - cls_id = int(klass) - if cls_id not in colors: - colors[cls_id] = (random.random(), random.random(), random.random()) - xmin = int(x0 * width) - ymin = int(y0 * height) - xmax = int(x1 * width) - ymax = int(y1 * height) - rect = plt.Rectangle((xmin, ymin), xmax - xmin, - ymax - ymin, fill=False, - edgecolor=colors[cls_id], - linewidth=3.5) - plt.gca().add_patch(rect) - class_name = str(cls_id) - if classes and len(classes) > cls_id: - class_name = classes[cls_id] - plt.gca().text(xmin, ymin - 2, - '{:s} {:.3f}'.format(class_name, score), - bbox=dict(facecolor=colors[cls_id], alpha=0.5), - fontsize=12, color='white') - plt.show() - - @staticmethod - def filter_positive_detections(detections): - """ - First column (class id) is -1 for negative detections - :param detections: - :return: - """ - class_idx = 0 - assert(isinstance(detections, mx.nd.NDArray) or isinstance(detections, np.ndarray)) - detections_per_image = [] - # for each image - for i in range(detections.shape[0]): - result = [] - det = detections[i, :, :] - for obj in det: - if obj[class_idx] >= 0: - result.append(obj) - detections_per_image.append(result) - logging.info("%d positive detections", len(result)) - return detections_per_image - - def detect_and_visualize(self, im_list, root_dir=None, extension=None, - classes=[], thresh=0.6, show_timer=False): - """ - wrapper for im_detect and visualize_detection - - Parameters: - ---------- - im_list : list of str or str - image path or list of image paths - root_dir : str or None - directory of input images, optional if image path already - has full directory information - extension : str or None - 
image extension, eg. ".jpg", optional - - Returns: - ---------- - - """ - dets = self.im_detect(im_list, root_dir, extension, show_timer=show_timer) - if not isinstance(im_list, list): - im_list = [im_list] - assert len(dets) == len(im_list) - for k, det in enumerate(dets): - img = cv2.imread(im_list[k]) - img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) - self.visualize_detection(img, det, classes, thresh) diff --git a/example/ssd/evaluate.py b/example/ssd/evaluate.py deleted file mode 100644 index bbe9feab333c..000000000000 --- a/example/ssd/evaluate.py +++ /dev/null @@ -1,108 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import argparse -import tools.find_mxnet -import mxnet as mx -import os -import sys -from evaluate.evaluate_net import evaluate_net - -def parse_args(): - parser = argparse.ArgumentParser(description='Evaluate a network') - parser.add_argument('--rec-path', dest='rec_path', help='which record file to use', - default=os.path.join(os.getcwd(), 'data', 'val.rec'), type=str) - parser.add_argument('--list-path', dest='list_path', help='which list file to use', - default="", type=str) - parser.add_argument('--network', dest='network', type=str, default='vgg16_reduced', - help='which network to use') - parser.add_argument('--num-batch', dest='num_batch', type=int, default=5, - help='evaluation number batches') - parser.add_argument('--batch-size', dest='batch_size', type=int, default=32, - help='evaluation batch size') - parser.add_argument('--num-class', dest='num_class', type=int, default=20, - help='number of classes') - parser.add_argument('--class-names', dest='class_names', type=str, - default='aeroplane, bicycle, bird, boat, bottle, bus, \ - car, cat, chair, cow, diningtable, dog, horse, motorbike, \ - person, pottedplant, sheep, sofa, train, tvmonitor', - help='string of comma separated names, or text filename') - parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model', - default=0, type=int) - parser.add_argument('--prefix', dest='prefix', help='load model prefix', - default=os.path.join(os.getcwd(), 'model', 'ssd_'), type=str) - parser.add_argument('--gpus', dest='gpu_id', help='GPU devices to evaluate with', - default='0', type=str) - parser.add_argument('--cpu', dest='cpu', help='use cpu to evaluate, this can be slow', - action='store_true') - parser.add_argument('--data-shape', dest='data_shape', type=int, default=300, - help='set image shape') - parser.add_argument('--mean-r', dest='mean_r', type=float, default=123, - help='red mean value') - parser.add_argument('--mean-g', dest='mean_g', type=float, default=117, - help='green mean value') - parser.add_argument('--mean-b', dest='mean_b', type=float, default=104, - help='blue mean value') - parser.add_argument('--nms', dest='nms_thresh', type=float, 
default=0.45, - help='non-maximum suppression threshold') - parser.add_argument('--overlap', dest='overlap_thresh', type=float, default=0.5, - help='evaluation overlap threshold') - parser.add_argument('--force', dest='force_nms', action='store_true', - help='force non-maximum suppression on different class') - parser.add_argument('--use-difficult', dest='use_difficult', action='store_true', - help='use difficult ground-truths in evaluation') - parser.add_argument('--no-voc07', dest='use_voc07_metric', action='store_false', - help='dont use PASCAL VOC 07 metric') - parser.add_argument('--deploy', dest='deploy_net', help='Load network from model', - action='store_true', default=False) - args = parser.parse_args() - return args - -if __name__ == '__main__': - args = parse_args() - # choose ctx - if args.cpu: - ctx = mx.cpu() - else: - ctx = [mx.gpu(int(i)) for i in args.gpu_id.split(',')] - # parse # classes and class_names if applicable - num_class = args.num_class - if len(args.class_names) > 0: - if os.path.isfile(args.class_names): - # try to open it to read class names - with open(args.class_names, 'r') as f: - class_names = [l.strip() for l in f.readlines()] - else: - class_names = [c.strip() for c in args.class_names.split(',')] - assert len(class_names) == num_class - for name in class_names: - assert len(name) > 0 - else: - class_names = None - - network = None if args.deploy_net else args.network - if args.prefix.endswith('_'): - prefix = args.prefix + args.network - else: - prefix = args.prefix - evaluate_net(network, args.rec_path, num_class, args.num_batch, - (args.mean_r, args.mean_g, args.mean_b), args.data_shape, - prefix, args.epoch, ctx, batch_size=args.batch_size, - path_imglist=args.list_path, nms_thresh=args.nms_thresh, - force_nms=args.force_nms, ovp_thresh=args.overlap_thresh, - use_difficult=args.use_difficult, class_names=class_names, - voc07_metric=args.use_voc07_metric) diff --git a/example/ssd/evaluate/__init__.py b/example/ssd/evaluate/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/ssd/evaluate/eval_metric.py b/example/ssd/evaluate/eval_metric.py deleted file mode 100644 index b038d3afb376..000000000000 --- a/example/ssd/evaluate/eval_metric.py +++ /dev/null @@ -1,295 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import mxnet as mx -import numpy as np - -class MApMetric(mx.gluon.metric.EvalMetric): - """ - Calculate mean AP for object detection task - - Parameters: - --------- - ovp_thresh : float - overlap threshold for TP - use_difficult : boolean - use difficult ground-truths if applicable, otherwise just ignore - class_names : list of str - optional, if provided, will print out AP for each class - pred_idx : int - prediction index in network output list - """ - def __init__(self, ovp_thresh=0.5, use_difficult=False, class_names=None, pred_idx=0): - super(MApMetric, self).__init__('mAP') - if class_names is None: - self.num = None - else: - assert isinstance(class_names, (list, tuple)) - for name in class_names: - assert isinstance(name, str), "must provide names as str" - num = len(class_names) - self.name = class_names + ['mAP'] - self.num = num + 1 - self.reset() - self.ovp_thresh = ovp_thresh - self.use_difficult = use_difficult - self.class_names = class_names - self.pred_idx = int(pred_idx) - - def reset(self): - """Clear the internal statistics to initial state.""" - if getattr(self, 'num', None) is None: - self.num_inst = 0 - self.sum_metric = 0.0 - else: - self.num_inst = [0] * self.num - self.sum_metric = [0.0] * self.num - self.records = dict() - self.counts = dict() - - def get(self): - """Get the current evaluation result. - - Returns - ------- - name : str - Name of the metric. - value : float - Value of the evaluation. - """ - self._update() # update metric at this time - if self.num is None: - if self.num_inst == 0: - return (self.name, float('nan')) - else: - return (self.name, self.sum_metric / self.num_inst) - else: - names = ['%s'%(self.name[i]) for i in range(self.num)] - values = [x / y if y != 0 else float('nan') \ - for x, y in zip(self.sum_metric, self.num_inst)] - return (names, values) - - def update(self, labels, preds): - """ - Update internal records. This function now only update internal buffer, - sum_metric and num_inst are updated in _update() function instead when - get() is called to return results. - - Params: - ---------- - labels: mx.nd.array (n * 6) or (n * 5), difficult column is optional - 2-d array of ground-truths, n objects(id-xmin-ymin-xmax-ymax-[difficult]) - preds: mx.nd.array (m * 6) - 2-d array of detections, m objects(id-score-xmin-ymin-xmax-ymax) - """ - def iou(x, ys): - """ - Calculate intersection-over-union overlap - Params: - ---------- - x : numpy.array - single box [xmin, ymin ,xmax, ymax] - ys : numpy.array - multiple box [[xmin, ymin, xmax, ymax], [...], ] - Returns: - ----------- - numpy.array - [iou1, iou2, ...], size == ys.shape[0] - """ - ixmin = np.maximum(ys[:, 0], x[0]) - iymin = np.maximum(ys[:, 1], x[1]) - ixmax = np.minimum(ys[:, 2], x[2]) - iymax = np.minimum(ys[:, 3], x[3]) - iw = np.maximum(ixmax - ixmin, 0.) - ih = np.maximum(iymax - iymin, 0.) 
- inters = iw * ih - uni = (x[2] - x[0]) * (x[3] - x[1]) + (ys[:, 2] - ys[:, 0]) * \ - (ys[:, 3] - ys[:, 1]) - inters - ious = inters / uni - ious[uni < 1e-12] = 0 # in case bad boxes - return ious - - # independant execution for each image - for i in range(labels[0].shape[0]): - # get as numpy arrays - label = labels[0][i].asnumpy() - if np.sum(label[:, 0] >= 0) < 1: - continue - pred = preds[self.pred_idx][i].asnumpy() - # calculate for each class - while (pred.shape[0] > 0): - cid = int(pred[0, 0]) - indices = np.where(pred[:, 0].astype(int) == cid)[0] - if cid < 0: - pred = np.delete(pred, indices, axis=0) - continue - dets = pred[indices] - pred = np.delete(pred, indices, axis=0) - # sort by score, desceding - dets = dets[dets[:,1].argsort()[::-1]] - records = np.hstack((dets[:, 1][:, np.newaxis], np.zeros((dets.shape[0], 1)))) - # ground-truths - label_indices = np.where(label[:, 0].astype(int) == cid)[0] - gts = label[label_indices, :] - label = np.delete(label, label_indices, axis=0) - if gts.size > 0: - found = [False] * gts.shape[0] - for j in range(dets.shape[0]): - # compute overlaps - ious = iou(dets[j, 2:], gts[:, 1:5]) - ovargmax = np.argmax(ious) - ovmax = ious[ovargmax] - if ovmax > self.ovp_thresh: - if (not self.use_difficult and - gts.shape[1] >= 6 and - gts[ovargmax, 5] > 0): - pass - else: - if not found[ovargmax]: - records[j, -1] = 1 # tp - found[ovargmax] = True - else: - # duplicate - records[j, -1] = 2 # fp - else: - records[j, -1] = 2 # fp - else: - # no gt, mark all fp - records[:, -1] = 2 - - # ground truth count - if (not self.use_difficult and gts.shape[1] >= 6): - gt_count = np.sum(gts[:, 5] < 1) - else: - gt_count = gts.shape[0] - - # now we push records to buffer - # first column: score, second column: tp/fp - # 0: not set(matched to difficult or something), 1: tp, 2: fp - records = records[np.where(records[:, -1] > 0)[0], :] - if records.size > 0: - self._insert(cid, records, gt_count) - - # add missing class if not present in prediction - while (label.shape[0] > 0): - cid = int(label[0, 0]) - label_indices = np.where(label[:, 0].astype(int) == cid)[0] - label = np.delete(label, label_indices, axis=0) - if cid < 0: - continue - gt_count = label_indices.size - self._insert(cid, np.array([[0, 0]]), gt_count) - - def _update(self): - """ update num_inst and sum_metric """ - aps = [] - for k, v in self.records.items(): - recall, prec = self._recall_prec(v, self.counts[k]) - ap = self._average_precision(recall, prec) - aps.append(ap) - if self.num is not None and k < (self.num - 1): - self.sum_metric[k] = ap - self.num_inst[k] = 1 - if self.num is None: - self.num_inst = 1 - self.sum_metric = np.mean(aps) - else: - self.num_inst[-1] = 1 - self.sum_metric[-1] = np.mean(aps) - - def _recall_prec(self, record, count): - """ get recall and precision from internal records """ - record = np.delete(record, np.where(record[:, 1].astype(int) == 0)[0], axis=0) - sorted_records = record[record[:,0].argsort()[::-1]] - tp = np.cumsum(sorted_records[:, 1].astype(int) == 1) - fp = np.cumsum(sorted_records[:, 1].astype(int) == 2) - if count <= 0: - recall = tp * 0.0 - else: - recall = tp / float(count) - prec = tp.astype(float) / (tp + fp) - return recall, prec - - def _average_precision(self, rec, prec): - """ - calculate average precision - - Params: - ---------- - rec : numpy.array - cumulated recall - prec : numpy.array - cumulated precision - Returns: - ---------- - ap as float - """ - # append sentinel values at both ends - mrec = np.concatenate(([0.], rec, [1.])) - 
mpre = np.concatenate(([0.], prec, [0.])) - - # compute precision integration ladder - for i in range(mpre.size - 1, 0, -1): - mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) - - # look for recall value changes - i = np.where(mrec[1:] != mrec[:-1])[0] - - # sum (\delta recall) * prec - ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) - return ap - - def _insert(self, key, records, count): - """ Insert records according to key """ - if key not in self.records: - assert key not in self.counts - self.records[key] = records - self.counts[key] = count - else: - self.records[key] = np.vstack((self.records[key], records)) - assert key in self.counts - self.counts[key] += count - - -class VOC07MApMetric(MApMetric): - """ Mean average precision metric for PASCAL V0C 07 dataset """ - def __init__(self, *args, **kwargs): - super(VOC07MApMetric, self).__init__(*args, **kwargs) - - def _average_precision(self, rec, prec): - """ - calculate average precision, override the default one, - special 11-point metric - - Params: - ---------- - rec : numpy.array - cumulated recall - prec : numpy.array - cumulated precision - Returns: - ---------- - ap as float - """ - ap = 0. - for t in np.arange(0., 1.1, 0.1): - if np.sum(rec >= t) == 0: - p = 0 - else: - p = np.max(prec[rec >= t]) - ap += p / 11. - return ap diff --git a/example/ssd/evaluate/eval_voc.py b/example/ssd/evaluate/eval_voc.py deleted file mode 100644 index 0ba7f7eaf843..000000000000 --- a/example/ssd/evaluate/eval_voc.py +++ /dev/null @@ -1,196 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -given a pascal voc imdb, compute mAP -""" -from __future__ import print_function -import numpy as np -import os -try: - import cPickle as pickle -except ImportError: - import pickle - - -def parse_voc_rec(filename): - """ - parse pascal voc record into a dictionary - :param filename: xml file path - :return: list of dict - """ - import xml.etree.ElementTree as ET - tree = ET.parse(filename) - objects = [] - for obj in tree.findall('object'): - obj_dict = dict() - obj_dict['name'] = obj.find('name').text - obj_dict['difficult'] = int(obj.find('difficult').text) - bbox = obj.find('bndbox') - obj_dict['bbox'] = [int(bbox.find('xmin').text), - int(bbox.find('ymin').text), - int(bbox.find('xmax').text), - int(bbox.find('ymax').text)] - objects.append(obj_dict) - return objects - - -def voc_ap(rec, prec, use_07_metric=False): - """ - average precision calculations - [precision integrated to recall] - :param rec: recall - :param prec: precision - :param use_07_metric: 2007 metric is 11-recall-point based AP - :return: average precision - """ - if use_07_metric: - ap = 0. - for t in np.arange(0., 1.1, 0.1): - if np.sum(rec >= t) == 0: - p = 0 - else: - p = np.max(prec[rec >= t]) - ap += p / 11. 
- else: - # append sentinel values at both ends - mrec = np.concatenate(([0.], rec, [1.])) - mpre = np.concatenate(([0.], prec, [0.])) - - # compute precision integration ladder - for i in range(mpre.size - 1, 0, -1): - mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) - - # look for recall value changes - i = np.where(mrec[1:] != mrec[:-1])[0] - - # sum (\delta recall) * prec - ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) - return ap - - -def voc_eval(detpath, annopath, imageset_file, classname, cache_dir, ovthresh=0.5, use_07_metric=False): - """ - pascal voc evaluation - :param detpath: detection results detpath.format(classname) - :param annopath: annotations annopath.format(classname) - :param imageset_file: text file containing list of images - :param classname: category name - :param cache_dir: caching annotations - :param ovthresh: overlap threshold - :param use_07_metric: whether to use voc07's 11 point ap computation - :return: rec, prec, ap - """ - if not os.path.isdir(cache_dir): - os.mkdir(cache_dir) - cache_file = os.path.join(cache_dir, 'annotations.pkl') - with open(imageset_file, 'r') as f: - lines = f.readlines() - image_filenames = [x.strip() for x in lines] - - # load annotations from cache - if not os.path.isfile(cache_file): - recs = {} - for ind, image_filename in enumerate(image_filenames): - recs[image_filename] = parse_voc_rec(annopath.format(image_filename)) - if ind % 100 == 0: - print('reading annotations for {:d}/{:d}'.format(ind + 1, len(image_filenames))) - print('saving annotations cache to {:s}'.format(cache_file)) - with open(cache_file, 'wb') as f: - pickle.dump(recs, f) - else: - with open(cache_file, 'rb') as f: - recs = pickle.load(f) - - # extract objects in :param classname: - class_recs = {} - npos = 0 - for image_filename in image_filenames: - objects = [obj for obj in recs[image_filename] if obj['name'] == classname] - bbox = np.array([x['bbox'] for x in objects]) - difficult = np.array([x['difficult'] for x in objects]).astype(np.bool) - det = [False] * len(objects) # stand for detected - npos = npos + sum(~difficult) - class_recs[image_filename] = {'bbox': bbox, - 'difficult': difficult, - 'det': det} - - # read detections - detfile = detpath.format(classname) - with open(detfile, 'r') as f: - lines = f.readlines() - - splitlines = [x.strip().split(' ') for x in lines] - image_ids = [x[0] for x in splitlines] - confidence = np.array([float(x[1]) for x in splitlines]) - bbox = np.array([[float(z) for z in x[2:]] for x in splitlines]) - - # sort by confidence - sorted_inds = np.argsort(-confidence) - sorted_scores = np.sort(-confidence) - bbox = bbox[sorted_inds, :] - image_ids = [image_ids[x] for x in sorted_inds] - - # go down detections and mark true positives and false positives - nd = len(image_ids) - tp = np.zeros(nd) - fp = np.zeros(nd) - for d in range(nd): - r = class_recs[image_ids[d]] - bb = bbox[d, :].astype(float) - ovmax = -np.inf - bbgt = r['bbox'].astype(float) - - if bbgt.size > 0: - # compute overlaps - # intersection - ixmin = np.maximum(bbgt[:, 0], bb[0]) - iymin = np.maximum(bbgt[:, 1], bb[1]) - ixmax = np.minimum(bbgt[:, 2], bb[2]) - iymax = np.minimum(bbgt[:, 3], bb[3]) - iw = np.maximum(ixmax - ixmin + 1., 0.) - ih = np.maximum(iymax - iymin + 1., 0.) - inters = iw * ih - - # union - uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + - (bbgt[:, 2] - bbgt[:, 0] + 1.) * - (bbgt[:, 3] - bbgt[:, 1] + 1.) 
- inters) - - overlaps = inters / uni - ovmax = np.max(overlaps) - jmax = np.argmax(overlaps) - - if ovmax > ovthresh: - if not r['difficult'][jmax]: - if not r['det'][jmax]: - tp[d] = 1. - r['det'][jmax] = 1 - else: - fp[d] = 1. - else: - fp[d] = 1. - - # compute precision recall - fp = np.cumsum(fp) - tp = np.cumsum(tp) - rec = tp / float(npos) - # avoid division by zero in case first detection matches a difficult ground ruth - prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) - ap = voc_ap(rec, prec, use_07_metric) - - return rec, prec, ap diff --git a/example/ssd/evaluate/evaluate_net.py b/example/ssd/evaluate/evaluate_net.py deleted file mode 100644 index 35e253d44bba..000000000000 --- a/example/ssd/evaluate/evaluate_net.py +++ /dev/null @@ -1,133 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from __future__ import print_function -import os -import sys -import importlib -import mxnet as mx -from dataset.iterator import DetRecordIter -from config.config import cfg -from evaluate.eval_metric import MApMetric, VOC07MApMetric -import logging -import time -from symbol.symbol_factory import get_symbol -from symbol import symbol_builder -from mxnet.base import SymbolHandle, check_call, _LIB, mx_uint, c_str_array -import ctypes -from mxnet.contrib.quantization import * - -def evaluate_net(net, path_imgrec, num_classes, num_batch, mean_pixels, data_shape, - model_prefix, epoch, ctx=mx.cpu(), batch_size=32, - path_imglist="", nms_thresh=0.45, force_nms=False, - ovp_thresh=0.5, use_difficult=False, class_names=None, - voc07_metric=False): - """ - evalute network given validation record file - - Parameters: - ---------- - net : str or None - Network name or use None to load from json without modifying - path_imgrec : str - path to the record validation file - path_imglist : str - path to the list file to replace labels in record file, optional - num_classes : int - number of classes, not including background - mean_pixels : tuple - (mean_r, mean_g, mean_b) - data_shape : tuple or int - (3, height, width) or height/width - model_prefix : str - model prefix of saved checkpoint - epoch : int - load model epoch - ctx : mx.ctx - mx.gpu() or mx.cpu() - batch_size : int - validation batch size - nms_thresh : float - non-maximum suppression threshold - force_nms : boolean - whether suppress different class objects - ovp_thresh : float - AP overlap threshold for true/false postives - use_difficult : boolean - whether to use difficult objects in evaluation if applicable - class_names : comma separated str - class names in string, must correspond to num_classes if set - voc07_metric : boolean - whether to use 11-point evluation as in VOC07 competition - """ - # set up logger - logging.basicConfig() - logger = logging.getLogger() - 
logger.setLevel(logging.INFO) - - # args - if isinstance(data_shape, int): - data_shape = (3, data_shape, data_shape) - assert len(data_shape) == 3 and data_shape[0] == 3 - model_prefix += '_' + str(data_shape[1]) - - # iterator - eval_iter = DetRecordIter(path_imgrec, batch_size, data_shape, mean_pixels=mean_pixels, - path_imglist=path_imglist, **cfg.valid) - # model params - load_net, args, auxs = mx.model.load_checkpoint(model_prefix, epoch) - # network - if net is None: - net = load_net - else: - net = get_symbol(net, data_shape[1], num_classes=num_classes, - nms_thresh=nms_thresh, force_suppress=force_nms) - if not 'label' in net.list_arguments(): - label = mx.sym.Variable(name='label') - net = mx.sym.Group([net, label]) - - # init module - mod = mx.mod.Module(net, label_names=('label',), logger=logger, context=ctx, - fixed_param_names=net.list_arguments()) - mod.bind(data_shapes=eval_iter.provide_data, label_shapes=eval_iter.provide_label) - mod.set_params(args, auxs, allow_missing=False, force_init=True) - - # run evaluation - if voc07_metric: - metric = VOC07MApMetric(ovp_thresh, use_difficult, class_names) - else: - metric = MApMetric(ovp_thresh, use_difficult, class_names) - - num = num_batch * batch_size - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx) for _, shape in mod.data_shapes] - batch = mx.io.DataBatch(data, []) # empty label - - dry_run = 5 # use 5 iterations to warm up - for i in range(dry_run): - mod.forward(batch, is_train=False) - for output in mod.get_outputs(): - output.wait_to_read() - - tic = time.time() - results = mod.score(eval_iter, metric, num_batch=num_batch) - speed = num / (time.time() - tic) - if logger is not None: - logger.info('Finished inference with %d images' % num) - logger.info('Finished with %f images per second', speed) - - for k, v in results: - print("{}: {}".format(k, v)) diff --git a/example/ssd/init.sh b/example/ssd/init.sh deleted file mode 100755 index 863ba871da8b..000000000000 --- a/example/ssd/init.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# -# Train SSD with Pascal VOC -# -set -e -set -x - - -function download_pascal_voc() { - pushd . - cd data - wget -c --show-progress http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar - wget -c --show-progress http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar - wget -c --show-progress http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar - tar -xvf VOCtrainval_11-May-2012.tar - tar -xvf VOCtrainval_06-Nov-2007.tar - tar -xvf VOCtest_06-Nov-2007.tar - popd -} - -function download_model() { - pushd . 
- MODELFILEURL="https://github.com/zhreshold/mxnet-ssd/releases/download/v0.6/resnet50_ssd_512_voc0712_trainval.zip" - MODELFILE="resnet50_ssd_512_voc0712_trainval.zip" - cd model - wget -c $MODELFILEURL - unzip $MODELFILE - popd -} - -function download_demo_images() { - pushd . - cd data/demo - ./download_demo_images.py - popd -} - -# Uncomment to download training dataset -#download_pascal_voc -download_model -download_demo_images diff --git a/example/ssd/model/README.md b/example/ssd/model/README.md deleted file mode 100644 index 7c77ee2475e8..000000000000 --- a/example/ssd/model/README.md +++ /dev/null @@ -1,18 +0,0 @@ - - - - - - - - - - - - - - - - - -#### This is the default directory to store all the models, including `*.params` and `*.json` diff --git a/example/ssd/quantization.py b/example/ssd/quantization.py deleted file mode 100644 index 7ae6278648d4..000000000000 --- a/example/ssd/quantization.py +++ /dev/null @@ -1,159 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from __future__ import print_function -import os -import sys -import importlib -import mxnet as mx -from dataset.iterator import DetRecordIter -from config.config import cfg -from evaluate.eval_metric import MApMetric, VOC07MApMetric -import argparse -import logging -import time -from symbol.symbol_factory import get_symbol -from symbol import symbol_builder -from mxnet.base import SymbolHandle, check_call, _LIB, mx_uint, c_str_array -import ctypes -from mxnet.contrib.quantization import * - -def save_symbol(fname, sym, logger=None): - if logger is not None: - logger.info('Saving symbol into file at %s' % fname) - sym.save(fname) - - -def save_params(fname, arg_params, aux_params, logger=None): - if logger is not None: - logger.info('Saving params into file at %s' % fname) - save_dict = {('arg:%s' % k): v.as_in_context(cpu()) for k, v in arg_params.items()} - save_dict.update({('aux:%s' % k): v.as_in_context(cpu()) for k, v in aux_params.items()}) - mx.nd.save(fname, save_dict) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Generate a calibrated quantized SSD model from a FP32 model') - parser.add_argument('--batch-size', type=int, default=32) - parser.add_argument('--num-calib-batches', type=int, default=5, - help='number of batches for calibration') - parser.add_argument('--exclude-first-conv', action='store_true', default=False, - help='excluding quantizing the first conv layer since the' - ' number of channels is usually not a multiple of 4 in that layer' - ' which does not satisfy the requirement of cuDNN') - parser.add_argument('--shuffle-dataset', action='store_true', default=True, - help='shuffle the calibration dataset') - parser.add_argument('--shuffle-chunk-seed', type=int, default=3982304, - help='shuffling chunk seed, see' - 
' https://mxnet.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter' - ' for more details') - parser.add_argument('--shuffle-seed', type=int, default=48564309, - help='shuffling seed, see' - ' https://mxnet.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter' - ' for more details') - parser.add_argument('--calib-mode', type=str, default='naive', - help='calibration mode used for generating calibration table for the quantized symbol; supports' - ' 1. none: no calibration will be used. The thresholds for quantization will be calculated' - ' on the fly. This will result in inference speed slowdown and loss of accuracy' - ' in general.' - ' 2. naive: simply take min and max values of layer outputs as thresholds for' - ' quantization. In general, the inference accuracy worsens with more examples used in' - ' calibration. It is recommended to use `entropy` mode as it produces more accurate' - ' inference results.' - ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal' - ' thresholds. This mode is expected to produce the best inference accuracy of all three' - ' kinds of quantized models if the calibration dataset is representative enough of the' - ' inference dataset.') - parser.add_argument('--quantized-dtype', type=str, default='auto', - choices=['auto', 'int8', 'uint8'], - help='quantization destination data type for input data') - - args = parser.parse_args() - ctx = mx.cpu(0) - logging.basicConfig() - logger = logging.getLogger('logger') - logger.setLevel(logging.INFO) - - logger.info('shuffle_dataset=%s' % args.shuffle_dataset) - - calib_mode = args.calib_mode - logger.info('calibration mode set to %s' % calib_mode) - - # load FP32 models - prefix, epoch = "./model/ssd_vgg16_reduced_300", 0 - sym, arg_params, aux_params = mx.model.load_checkpoint("./model/ssd_vgg16_reduced_300", 0) - - if not 'label' in sym.list_arguments(): - label = mx.sym.Variable(name='label') - sym = mx.sym.Group([sym, label]) - - sym = sym.get_backend_symbol('MKLDNN_QUANTIZE') - - # get batch size - batch_size = args.batch_size - logger.info('batch size = %d for calibration' % batch_size) - - # get number of batches for calibration - num_calib_batches = args.num_calib_batches - if calib_mode != 'none': - logger.info('number of batches = %d for calibration' % num_calib_batches) - - # get image shape - image_shape = '3,300,300' - - # Quantization layer configs - exclude_first_conv = args.exclude_first_conv - excluded_sym_names = [] - rgb_mean = '123,117,104' - if exclude_first_conv: - excluded_sym_names += ['conv1_1'] - - label_name = 'label' - logger.info('label_name = %s' % label_name) - - data_shape = tuple([int(i) for i in image_shape.split(',')]) - logger.info('Input data shape = %s' % str(data_shape)) - - logger.info('rgb_mean = %s' % rgb_mean) - rgb_mean = [float(i) for i in rgb_mean.split(',')] - mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1], 'mean_b': rgb_mean[2]} - - if calib_mode == 'none': - qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - ctx=ctx, excluded_sym_names=excluded_sym_names, - calib_mode=calib_mode, quantized_dtype=args.quantized_dtype, - logger=logger) - sym_name = '%s-symbol.json' % ('./model/qssd_vgg16_reduced_300') - param_name = '%s-%04d.params' % ('./model/qssd_vgg16_reduced_300', epoch) - save_symbol(sym_name, qsym, logger) - else: - logger.info('Creating ImageRecordIter for reading calibration dataset') - eval_iter = 
DetRecordIter(os.path.join(os.getcwd(), 'data', 'val.rec'), - batch_size, data_shape, mean_pixels=(123, 117, 104), - path_imglist="", **cfg.valid) - - qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - ctx=ctx, excluded_sym_names=excluded_sym_names, - calib_mode=calib_mode, calib_data=eval_iter, - num_calib_examples=num_calib_batches * batch_size, - quantized_dtype=args.quantized_dtype, - label_names=(label_name,), logger=logger) - sym_name = '%s-symbol.json' % ('./model/cqssd_vgg16_reduced_300') - param_name = '%s-%04d.params' % ('./model/cqssd_vgg16_reduced_300', epoch) - qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE') - save_symbol(sym_name, qsym, logger) - save_params(param_name, qarg_params, aux_params, logger) diff --git a/example/ssd/symbol/README.md b/example/ssd/symbol/README.md deleted file mode 100644 index d577b7067c92..000000000000 --- a/example/ssd/symbol/README.md +++ /dev/null @@ -1,66 +0,0 @@ - - - - - - - - - - - - - - - - - -## How to compose SSD network on top of mainstream classification networks - -1. Have the base network ready in this directory as `name.py`, such as `inceptionv3.py`. -2. Add configuration to `symbol_factory.py`, an example would be: -``` -if network == 'vgg16_reduced': - if data_shape >= 448: - from_layers = ['relu4_3', 'relu7', '', '', '', '', ''] - num_filters = [512, -1, 512, 256, 256, 256, 256] - strides = [-1, -1, 2, 2, 2, 2, 1] - pads = [-1, -1, 1, 1, 1, 1, 1] - sizes = [[.07, .1025], [.15,.2121], [.3, .3674], [.45, .5196], [.6, .6708], \ - [.75, .8216], [.9, .9721]] - ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ - [1,2,.5,3,1./3], [1,2,.5], [1,2,.5]] - normalizations = [20, -1, -1, -1, -1, -1, -1] - steps = [] if data_shape != 512 else [x / 512.0 for x in - [8, 16, 32, 64, 128, 256, 512]] - else: - from_layers = ['relu4_3', 'relu7', '', '', '', ''] - num_filters = [512, -1, 512, 256, 256, 256] - strides = [-1, -1, 2, 2, 1, 1] - pads = [-1, -1, 1, 1, 0, 0] - sizes = [[.1, .141], [.2,.272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]] - ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ - [1,2,.5], [1,2,.5]] - normalizations = [20, -1, -1, -1, -1, -1] - steps = [] if data_shape != 300 else [x / 300.0 for x in [8, 16, 32, 64, 100, 300]] - return locals() -elif network == 'inceptionv3': - from_layers = ['ch_concat_mixed_7_chconcat', 'ch_concat_mixed_10_chconcat', '', '', '', ''] - num_filters = [-1, -1, 512, 256, 256, 128] - strides = [-1, -1, 2, 2, 2, 2] - pads = [-1, -1, 1, 1, 1, 1] - sizes = [[.1, .141], [.2,.272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]] - ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ - [1,2,.5], [1,2,.5]] - normalizations = -1 - steps = [] - return locals() -``` -Here `from_layers` indicate the feature layer you would like to extract from the base network. -`''` indicate that we want add extra new layers on top of the last feature layer, -and the number of filters must be specified in `num_filters`. Similarly, `strides` and `pads` -are required to compose these new layers. `sizes` and `ratios` are the parameters controlling -the anchor generation algorithm. `normalizations` is used to normalize and rescale feature if -not `-1`. `steps`: optional, used to calculate the anchor sliding steps. - -3. 
Train or test with arguments `--network name --data-shape xxx --pretrained pretrained_model` diff --git a/example/ssd/symbol/__init__.py b/example/ssd/symbol/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/ssd/symbol/common.py b/example/ssd/symbol/common.py deleted file mode 100644 index a2fb4e69d18c..000000000000 --- a/example/ssd/symbol/common.py +++ /dev/null @@ -1,304 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -import numpy as np - -def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False): - """ - wrapper for a small Convolution group - - Parameters: - ---------- - from_layer : mx.symbol - continue on which layer - name : str - base name of the new layers - num_filter : int - how many filters to use in Convolution layer - kernel : tuple (int, int) - kernel size (h, w) - pad : tuple (int, int) - padding size (h, w) - stride : tuple (int, int) - stride size (h, w) - act_type : str - activation type, can be relu... - use_batchnorm : bool - whether to use batch normalization - - Returns: - ---------- - (conv, relu) mx.Symbols - """ - conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ - stride=stride, num_filter=num_filter, name="{}_conv".format(name)) - if use_batchnorm: - conv = mx.symbol.BatchNorm(data=conv, name="{}_bn".format(name)) - relu = mx.symbol.Activation(data=conv, act_type=act_type, \ - name="{}_{}".format(name, act_type)) - return relu - -def legacy_conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False): - """ - wrapper for a small Convolution group - - Parameters: - ---------- - from_layer : mx.symbol - continue on which layer - name : str - base name of the new layers - num_filter : int - how many filters to use in Convolution layer - kernel : tuple (int, int) - kernel size (h, w) - pad : tuple (int, int) - padding size (h, w) - stride : tuple (int, int) - stride size (h, w) - act_type : str - activation type, can be relu... 
- use_batchnorm : bool - whether to use batch normalization - - Returns: - ---------- - (conv, relu) mx.Symbols - """ - assert not use_batchnorm, "batchnorm not yet supported" - bias = mx.symbol.Variable(name="conv{}_bias".format(name), - init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0'}) - conv = mx.symbol.Convolution(data=from_layer, bias=bias, kernel=kernel, pad=pad, \ - stride=stride, num_filter=num_filter, name="conv{}".format(name)) - relu = mx.symbol.Activation(data=conv, act_type=act_type, \ - name="{}{}".format(act_type, name)) - if use_batchnorm: - relu = mx.symbol.BatchNorm(data=relu, name="bn{}".format(name)) - return conv, relu - -def multi_layer_feature(body, from_layers, num_filters, strides, pads, min_filter=128): - """Wrapper function to extract features from base network, attaching extra - layers and SSD specific layers - - Parameters - ---------- - from_layers : list of str - feature extraction layers, use '' for add extra layers - For example: - from_layers = ['relu4_3', 'fc7', '', '', '', ''] - which means extract feature from relu4_3 and fc7, adding 4 extra layers - on top of fc7 - num_filters : list of int - number of filters for extra layers, you can use -1 for extracted features, - however, if normalization and scale is applied, the number of filter for - that layer must be provided. - For example: - num_filters = [512, -1, 512, 256, 256, 256] - strides : list of int - strides for the 3x3 convolution appended, -1 can be used for extracted - feature layers - pads : list of int - paddings for the 3x3 convolution, -1 can be used for extracted layers - min_filter : int - minimum number of filters used in 1x1 convolution - - Returns - ------- - list of mx.Symbols - - """ - # arguments check - assert len(from_layers) > 0 - assert isinstance(from_layers[0], str) and len(from_layers[0].strip()) > 0 - assert len(from_layers) == len(num_filters) == len(strides) == len(pads) - - internals = body.get_internals() - layers = [] - for k, params in enumerate(zip(from_layers, num_filters, strides, pads)): - from_layer, num_filter, s, p = params - if from_layer.strip(): - # extract from base network - layer = internals[from_layer.strip() + '_output'] - layers.append(layer) - else: - # attach from last feature layer - assert len(layers) > 0 - assert num_filter > 0 - layer = layers[-1] - num_1x1 = max(min_filter, num_filter // 2) - conv_1x1 = conv_act_layer(layer, 'multi_feat_%d_conv_1x1' % (k), - num_1x1, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu') - conv_3x3 = conv_act_layer(conv_1x1, 'multi_feat_%d_conv_3x3' % (k), - num_filter, kernel=(3, 3), pad=(p, p), stride=(s, s), act_type='relu') - layers.append(conv_3x3) - return layers - -def multibox_layer(from_layers, num_classes, sizes=[.2, .95], - ratios=[1], normalization=-1, num_channels=[], - clip=False, interm_layer=0, steps=[]): - """ - the basic aggregation module for SSD detection. Takes in multiple layers, - generate multiple object detection targets by customized layers - - Parameters: - ---------- - from_layers : list of mx.symbol - generate multibox detection from layers - num_classes : int - number of classes excluding background, will automatically handle - background in this function - sizes : list or list of list - [min_size, max_size] for all layers or [[], [], []...] for specific layers - ratios : list or list of list - [ratio1, ratio2...] for all layers or [[], [], ...] for specific layers - normalizations : int or list of int - use normalizations value for all layers or [...] 
for specific layers, - -1 indicate no normalizations and scales - num_channels : list of int - number of input layer channels, used when normalization is enabled, the - length of list should equals to number of normalization layers - clip : bool - whether to clip out-of-image boxes - interm_layer : int - if > 0, will add a intermediate Convolution layer - steps : list - specify steps for each MultiBoxPrior layer, leave empty, it will calculate - according to layer dimensions - - Returns: - ---------- - list of outputs, as [loc_preds, cls_preds, anchor_boxes] - loc_preds : localization regression prediction - cls_preds : classification prediction - anchor_boxes : generated anchor boxes - """ - assert len(from_layers) > 0, "from_layers must not be empty list" - assert num_classes > 0, \ - "num_classes {} must be larger than 0".format(num_classes) - - assert len(ratios) > 0, "aspect ratios must not be empty list" - if not isinstance(ratios[0], list): - # provided only one ratio list, broadcast to all from_layers - ratios = [ratios] * len(from_layers) - assert len(ratios) == len(from_layers), \ - "ratios and from_layers must have same length" - - assert len(sizes) > 0, "sizes must not be empty list" - if len(sizes) == 2 and not isinstance(sizes[0], list): - # provided size range, we need to compute the sizes for each layer - assert sizes[0] > 0 and sizes[0] < 1 - assert sizes[1] > 0 and sizes[1] < 1 and sizes[1] > sizes[0] - tmp = np.linspace(sizes[0], sizes[1], num=(len(from_layers)-1)) - # Ref for start_offset value: - # https://arxiv.org/abs/1512.02325 - start_offset = 0.1 - min_sizes = [start_offset] + tmp.tolist() - max_sizes = tmp.tolist() + [tmp[-1]+start_offset] - sizes = zip(min_sizes, max_sizes) - assert len(sizes) == len(from_layers), \ - "sizes and from_layers must have same length" - - if not isinstance(normalization, list): - normalization = [normalization] * len(from_layers) - assert len(normalization) == len(from_layers) - - assert sum(x > 0 for x in normalization) <= len(num_channels), \ - "must provide number of channels for each normalized layer" - - if steps: - assert len(steps) == len(from_layers), "provide steps for all layers or leave empty" - - loc_pred_layers = [] - cls_pred_layers = [] - anchor_layers = [] - num_classes += 1 # always use background as label 0 - - for k, from_layer in enumerate(from_layers): - from_name = from_layer.name - # normalize - if normalization[k] > 0: - from_layer = mx.symbol.L2Normalization(data=from_layer, \ - mode="channel", name="{}_norm".format(from_name)) - scale = mx.symbol.Variable(name="{}_scale".format(from_name), - shape=(1, num_channels.pop(0), 1, 1), - init=mx.init.Constant(normalization[k]), - attr={'__wd_mult__': '0.1'}) - from_layer = mx.symbol.broadcast_mul(lhs=scale, rhs=from_layer) - if interm_layer > 0: - from_layer = mx.symbol.Convolution(data=from_layer, kernel=(3,3), \ - stride=(1,1), pad=(1,1), num_filter=interm_layer, \ - name="{}_inter_conv".format(from_name)) - from_layer = mx.symbol.Activation(data=from_layer, act_type="relu", \ - name="{}_inter_relu".format(from_name)) - - # estimate number of anchors per location - # here I follow the original version in caffe - # TODO: better way to shape the anchors?? 
- size = sizes[k] - assert len(size) > 0, "must provide at least one size" - size_str = "(" + ",".join([str(x) for x in size]) + ")" - ratio = ratios[k] - assert len(ratio) > 0, "must provide at least one ratio" - ratio_str = "(" + ",".join([str(x) for x in ratio]) + ")" - num_anchors = len(size) -1 + len(ratio) - - # create location prediction layer - num_loc_pred = num_anchors * 4 - bias = mx.symbol.Variable(name="{}_loc_pred_conv_bias".format(from_name), - init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0'}) - loc_pred = mx.symbol.Convolution(data=from_layer, bias=bias, kernel=(3,3), \ - stride=(1,1), pad=(1,1), num_filter=num_loc_pred, \ - name="{}_loc_pred_conv".format(from_name)) - loc_pred = mx.symbol.transpose(loc_pred, axes=(0,2,3,1)) - loc_pred = mx.symbol.Flatten(data=loc_pred) - loc_pred_layers.append(loc_pred) - - # create class prediction layer - num_cls_pred = num_anchors * num_classes - bias = mx.symbol.Variable(name="{}_cls_pred_conv_bias".format(from_name), - init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0'}) - cls_pred = mx.symbol.Convolution(data=from_layer, bias=bias, kernel=(3,3), \ - stride=(1,1), pad=(1,1), num_filter=num_cls_pred, \ - name="{}_cls_pred_conv".format(from_name)) - cls_pred = mx.symbol.transpose(cls_pred, axes=(0,2,3,1)) - cls_pred = mx.symbol.Flatten(data=cls_pred) - cls_pred_layers.append(cls_pred) - - # create anchor generation layer - if steps: - step = (steps[k], steps[k]) - else: - step = '(-1.0, -1.0)' - anchors = mx.symbol.contrib.MultiBoxPrior(from_layer, sizes=size_str, ratios=ratio_str, - clip=clip, name="{}_anchors".format(from_name), - steps=step) - anchors = mx.symbol.Flatten(data=anchors) - anchor_layers.append(anchors) - - loc_preds = mx.symbol.Concat(*loc_pred_layers, num_args=len(loc_pred_layers), \ - dim=1, name="multibox_loc_pred") - cls_preds = mx.symbol.Concat(*cls_pred_layers, num_args=len(cls_pred_layers), \ - dim=1) - cls_preds = mx.symbol.Reshape(data=cls_preds, shape=(0, -1, num_classes)) - cls_preds = mx.symbol.transpose(cls_preds, axes=(0, 2, 1), name="multibox_cls_pred") - anchor_boxes = mx.symbol.Concat(*anchor_layers, \ - num_args=len(anchor_layers), dim=1) - anchor_boxes = mx.symbol.Reshape(data=anchor_boxes, shape=(0, -1, 4), name="multibox_anchors") - return [loc_preds, cls_preds, anchor_boxes] diff --git a/example/ssd/symbol/inceptionv3.py b/example/ssd/symbol/inceptionv3.py deleted file mode 100644 index 6022ce505a8f..000000000000 --- a/example/ssd/symbol/inceptionv3.py +++ /dev/null @@ -1,185 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Inception V3, suitable for images with around 299 x 299 - -Reference: - -Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015). 
-""" -import mxnet as mx - -def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''): - conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix)) - bn = mx.sym.BatchNorm(data=conv, name='%s%s_batchnorm' %(name, suffix), fix_gamma=True) - act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix)) - return act - - -def Inception7A(data, - num_1x1, - num_3x3_red, num_3x3_1, num_3x3_2, - num_5x5_red, num_5x5, - pool, proj, - name): - tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name)) - tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv') - tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1') - tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv') - tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') - tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2') - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv') - concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name) - return concat - -# First Downsample -def Inception7B(data, - num_3x3, - num_d3x3_red, num_d3x3_1, num_d3x3_2, - pool, - name): - tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name)) - tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv') - tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_tower' % name), suffix='_conv_1') - tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2') - pooling = mx.symbol.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name)) - concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name) - return concat - -def Inception7C(data, - num_1x1, - num_d7_red, num_d7_1, num_d7_2, - num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4, - pool, proj, - name): - tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) - tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv') - tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1') - tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2') - tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv') - tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1') - tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2') - tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3') - tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4') - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), 
pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), suffix='_conv') - # concat - concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name) - return concat - -def Inception7D(data, - num_3x3_red, num_3x3, - num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3, - pool, - name): - tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv') - tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1') - tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv') - tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1') - tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2') - tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3') - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - # concat - concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name) - return concat - -def Inception7E(data, - num_1x1, - num_d3_red, num_d3_1, num_d3_2, - num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2, - pool, proj, - name): - tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) - tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv') - tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv') - tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1') - tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv') - tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') - tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv') - tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1') - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), suffix='_conv') - # concat - concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name) - return concat - -# In[49]: - -def get_symbol(num_classes=1000, **kwargs): - data = mx.symbol.Variable(name="data") - # stage 1 - conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv") - conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1") - conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2") - pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool") - # stage 2 - conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3") - conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4") - pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), 
pool_type="max", name="pool1") - # stage 3 - in3a = Inception7A(pool1, 64, - 64, 96, 96, - 48, 64, - "avg", 32, "mixed") - in3b = Inception7A(in3a, 64, - 64, 96, 96, - 48, 64, - "avg", 64, "mixed_1") - in3c = Inception7A(in3b, 64, - 64, 96, 96, - 48, 64, - "avg", 64, "mixed_2") - in3d = Inception7B(in3c, 384, - 64, 96, 96, - "max", "mixed_3") - # stage 4 - in4a = Inception7C(in3d, 192, - 128, 128, 192, - 128, 128, 128, 128, 192, - "avg", 192, "mixed_4") - in4b = Inception7C(in4a, 192, - 160, 160, 192, - 160, 160, 160, 160, 192, - "avg", 192, "mixed_5") - in4c = Inception7C(in4b, 192, - 160, 160, 192, - 160, 160, 160, 160, 192, - "avg", 192, "mixed_6") - in4d = Inception7C(in4c, 192, - 192, 192, 192, - 192, 192, 192, 192, 192, - "avg", 192, "mixed_7") - in4e = Inception7D(in4d, 192, 320, - 192, 192, 192, 192, - "max", "mixed_8") - # stage 5 - in5a = Inception7E(in4e, 320, - 384, 384, 384, - 448, 384, 384, 384, - "avg", 192, "mixed_9") - in5b = Inception7E(in5a, 320, - 384, 384, 384, - 448, 384, 384, 384, - "max", 192, "mixed_10") - # pool - pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool") - flatten = mx.sym.Flatten(data=pool, name="flatten") - fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1') - softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax') - return softmax diff --git a/example/ssd/symbol/legacy_vgg16_ssd_300.py b/example/ssd/symbol/legacy_vgg16_ssd_300.py deleted file mode 100644 index 0acac6e4294b..000000000000 --- a/example/ssd/symbol/legacy_vgg16_ssd_300.py +++ /dev/null @@ -1,207 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import mxnet as mx -from common import legacy_conv_act_layer -from common import multibox_layer - -def get_symbol_train(num_classes=20, nms_thresh=0.5, force_suppress=False, - nms_topk=400, **kwargs): - """ - Single-shot multi-box detection with VGG 16 layers ConvNet - This is a modified version, with fc6/fc7 layers replaced by conv layers - And the network is slightly smaller than original VGG 16 network - This is a training network with losses - - Parameters: - ---------- - num_classes: int - number of object classes not including background - nms_thresh : float - non-maximum suppression threshold - force_suppress : boolean - whether suppress different class objects - nms_topk : int - apply NMS to top K detections - - Returns: - ---------- - mx.Symbol - """ - data = mx.symbol.Variable(name="data") - label = mx.symbol.Variable(name="label") - - # group 1 - conv1_1 = mx.symbol.Convolution( - data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1") - relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1") - conv1_2 = mx.symbol.Convolution( - data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2") - relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2") - pool1 = mx.symbol.Pooling( - data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool1") - # group 2 - conv2_1 = mx.symbol.Convolution( - data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1") - relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1") - conv2_2 = mx.symbol.Convolution( - data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_2") - relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2") - pool2 = mx.symbol.Pooling( - data=relu2_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool2") - # group 3 - conv3_1 = mx.symbol.Convolution( - data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1") - relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1") - conv3_2 = mx.symbol.Convolution( - data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2") - relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2") - conv3_3 = mx.symbol.Convolution( - data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3") - relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3") - pool3 = mx.symbol.Pooling( - data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), \ - pooling_convention="full", name="pool3") - # group 4 - conv4_1 = mx.symbol.Convolution( - data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1") - relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1") - conv4_2 = mx.symbol.Convolution( - data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2") - relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2") - conv4_3 = mx.symbol.Convolution( - data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3") - relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3") - pool4 = mx.symbol.Pooling( - data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool4") - # group 5 - conv5_1 = mx.symbol.Convolution( - data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1") - relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1") - conv5_2 = mx.symbol.Convolution( - data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, 
name="conv5_2") - relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2") - conv5_3 = mx.symbol.Convolution( - data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3") - relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3") - pool5 = mx.symbol.Pooling( - data=relu5_3, pool_type="max", kernel=(3, 3), stride=(1, 1), - pad=(1,1), name="pool5") - # group 6 - conv6 = mx.symbol.Convolution( - data=pool5, kernel=(3, 3), pad=(6, 6), dilate=(6, 6), - num_filter=1024, name="conv6") - relu6 = mx.symbol.Activation(data=conv6, act_type="relu", name="relu6") - # drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") - # group 7 - conv7 = mx.symbol.Convolution( - data=relu6, kernel=(1, 1), pad=(0, 0), num_filter=1024, name="conv7") - relu7 = mx.symbol.Activation(data=conv7, act_type="relu", name="relu7") - # drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") - - ### ssd extra layers ### - conv8_1, relu8_1 = legacy_conv_act_layer(relu7, "8_1", 256, kernel=(1,1), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - conv8_2, relu8_2 = legacy_conv_act_layer(relu8_1, "8_2", 512, kernel=(3,3), pad=(1,1), \ - stride=(2,2), act_type="relu", use_batchnorm=False) - conv9_1, relu9_1 = legacy_conv_act_layer(relu8_2, "9_1", 128, kernel=(1,1), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - conv9_2, relu9_2 = legacy_conv_act_layer(relu9_1, "9_2", 256, kernel=(3,3), pad=(1,1), \ - stride=(2,2), act_type="relu", use_batchnorm=False) - conv10_1, relu10_1 = legacy_conv_act_layer(relu9_2, "10_1", 128, kernel=(1,1), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - conv10_2, relu10_2 = legacy_conv_act_layer(relu10_1, "10_2", 256, kernel=(3,3), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - conv11_1, relu11_1 = legacy_conv_act_layer(relu10_2, "11_1", 128, kernel=(1,1), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - conv11_2, relu11_2 = legacy_conv_act_layer(relu11_1, "11_2", 256, kernel=(3,3), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - - # specific parameters for VGG16 network - from_layers = [relu4_3, relu7, relu8_2, relu9_2, relu10_2, relu11_2] - sizes = [[.1, .141], [.2,.272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]] - ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ - [1,2,.5], [1,2,.5]] - normalizations = [20, -1, -1, -1, -1, -1] - steps = [ x / 300.0 for x in [8, 16, 32, 64, 100, 300]] - num_channels = [512] - - loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers, \ - num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ - num_channels=num_channels, clip=False, interm_layer=0, steps=steps) - - tmp = mx.symbol.contrib.MultiBoxTarget( - *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \ - ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0, \ - negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), - name="multibox_target") - loc_target = tmp[0] - loc_target_mask = tmp[1] - cls_target = tmp[2] - - cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \ - ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True, \ - normalization='valid', name="cls_prob") - loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \ - data=loc_target_mask * (loc_preds - loc_target), scalar=1.0) - loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1., \ - normalization='valid', name="loc_loss") - - # monitoring training status - 
cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label") - det = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ - name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, - variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) - det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out") - - # group output - out = mx.symbol.Group([cls_prob, loc_loss, cls_label, det]) - return out - -def get_symbol(num_classes=20, nms_thresh=0.5, force_suppress=False, - nms_topk=400, **kwargs): - """ - Single-shot multi-box detection with VGG 16 layers ConvNet - This is a modified version, with fc6/fc7 layers replaced by conv layers - And the network is slightly smaller than original VGG 16 network - This is the detection network - - Parameters: - ---------- - num_classes: int - number of object classes not including background - nms_thresh : float - threshold of overlap for non-maximum suppression - force_suppress : boolean - whether suppress different class objects - nms_topk : int - apply NMS to top K detections - - Returns: - ---------- - mx.Symbol - """ - net = get_symbol_train(num_classes) - cls_preds = net.get_internals()["multibox_cls_pred_output"] - loc_preds = net.get_internals()["multibox_loc_pred_output"] - anchor_boxes = net.get_internals()["multibox_anchors_output"] - - cls_prob = mx.symbol.softmax(data=cls_preds, axis=1, name='cls_prob') - out = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ - name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, - variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) - return out diff --git a/example/ssd/symbol/legacy_vgg16_ssd_512.py b/example/ssd/symbol/legacy_vgg16_ssd_512.py deleted file mode 100644 index 74d6b37fc11e..000000000000 --- a/example/ssd/symbol/legacy_vgg16_ssd_512.py +++ /dev/null @@ -1,210 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
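The training heads in legacy_vgg16_ssd_300.py, legacy_vgg16_ssd_512.py, and symbol_builder.py all wire the multibox outputs the same way: MultiBoxTarget matches anchors to ground truth and mines hard negatives, a SoftmaxOutput provides the classification loss, and a smooth L1 term wrapped in MakeLoss provides the localization loss. The sketch below isolates that shared pattern; the Variable placeholders stand in for multibox_layer outputs, so this builds symbols only and is not a drop-in replacement for the deleted code.

```python
import mxnet as mx

# Placeholders for the outputs of multibox_layer (illustrative only).
loc_preds = mx.sym.Variable('multibox_loc_pred')
cls_preds = mx.sym.Variable('multibox_cls_pred')
anchors = mx.sym.Variable('multibox_anchors')
label = mx.sym.Variable('label')

# Match anchors to ground truth and mine hard negatives (3:1 ratio), as in the deleted files.
tmp = mx.sym.contrib.MultiBoxTarget(
    anchors, label, cls_preds, overlap_threshold=0.5, ignore_label=-1,
    negative_mining_ratio=3, negative_mining_thresh=0.5,
    variances=(0.1, 0.1, 0.2, 0.2), name='multibox_target')
loc_target, loc_target_mask, cls_target = tmp[0], tmp[1], tmp[2]

# Classification loss over matched anchors; unmatched anchors are ignored.
cls_prob = mx.sym.SoftmaxOutput(data=cls_preds, label=cls_target, ignore_label=-1,
                                use_ignore=True, multi_output=True,
                                normalization='valid', name='cls_prob')
# Smooth L1 localization loss, masked to positive anchors only.
loc_loss = mx.sym.MakeLoss(
    mx.sym.smooth_l1(data=loc_target_mask * (loc_preds - loc_target), scalar=1.0),
    grad_scale=1., normalization='valid', name='loc_loss')
out = mx.sym.Group([cls_prob, loc_loss])
```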
- -import mxnet as mx -from common import legacy_conv_act_layer -from common import multibox_layer - -def get_symbol_train(num_classes=20, nms_thresh=0.5, force_suppress=False, nms_topk=400): - """ - Single-shot multi-box detection with VGG 16 layers ConvNet - This is a modified version, with fc6/fc7 layers replaced by conv layers - And the network is slightly smaller than original VGG 16 network - This is a training network with losses - - Parameters: - ---------- - num_classes: int - number of object classes not including background - nms_thresh : float - non-maximum suppression threshold - force_suppress : boolean - whether suppress different class objects - nms_topk : int - apply NMS to top K detections - - Returns: - ---------- - mx.Symbol - """ - data = mx.symbol.Variable(name="data") - label = mx.symbol.Variable(name="label") - - # group 1 - conv1_1 = mx.symbol.Convolution( - data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1") - relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1") - conv1_2 = mx.symbol.Convolution( - data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2") - relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2") - pool1 = mx.symbol.Pooling( - data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool1") - # group 2 - conv2_1 = mx.symbol.Convolution( - data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1") - relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1") - conv2_2 = mx.symbol.Convolution( - data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_2") - relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2") - pool2 = mx.symbol.Pooling( - data=relu2_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool2") - # group 3 - conv3_1 = mx.symbol.Convolution( - data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1") - relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1") - conv3_2 = mx.symbol.Convolution( - data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2") - relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2") - conv3_3 = mx.symbol.Convolution( - data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3") - relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3") - pool3 = mx.symbol.Pooling( - data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), \ - pooling_convention="full", name="pool3") - # group 4 - conv4_1 = mx.symbol.Convolution( - data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1") - relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1") - conv4_2 = mx.symbol.Convolution( - data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2") - relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2") - conv4_3 = mx.symbol.Convolution( - data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3") - relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3") - pool4 = mx.symbol.Pooling( - data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool4") - # group 5 - conv5_1 = mx.symbol.Convolution( - data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1") - relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1") - conv5_2 = mx.symbol.Convolution( - data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2") - 
relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2") - conv5_3 = mx.symbol.Convolution( - data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3") - relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3") - pool5 = mx.symbol.Pooling( - data=relu5_3, pool_type="max", kernel=(3, 3), stride=(1, 1), - pad=(1,1), name="pool5") - # group 6 - conv6 = mx.symbol.Convolution( - data=pool5, kernel=(3, 3), pad=(6, 6), dilate=(6, 6), - num_filter=1024, name="conv6") - relu6 = mx.symbol.Activation(data=conv6, act_type="relu", name="relu6") - # drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") - # group 7 - conv7 = mx.symbol.Convolution( - data=relu6, kernel=(1, 1), pad=(0, 0), num_filter=1024, name="conv7") - relu7 = mx.symbol.Activation(data=conv7, act_type="relu", name="relu7") - # drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") - - ### ssd extra layers ### - conv8_1, relu8_1 = legacy_conv_act_layer(relu7, "8_1", 256, kernel=(1,1), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - conv8_2, relu8_2 = legacy_conv_act_layer(relu8_1, "8_2", 512, kernel=(3,3), pad=(1,1), \ - stride=(2,2), act_type="relu", use_batchnorm=False) - conv9_1, relu9_1 = legacy_conv_act_layer(relu8_2, "9_1", 128, kernel=(1,1), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - conv9_2, relu9_2 = legacy_conv_act_layer(relu9_1, "9_2", 256, kernel=(3,3), pad=(1,1), \ - stride=(2,2), act_type="relu", use_batchnorm=False) - conv10_1, relu10_1 = legacy_conv_act_layer(relu9_2, "10_1", 128, kernel=(1,1), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - conv10_2, relu10_2 = legacy_conv_act_layer(relu10_1, "10_2", 256, kernel=(3,3), pad=(1,1), \ - stride=(2,2), act_type="relu", use_batchnorm=False) - conv11_1, relu11_1 = legacy_conv_act_layer(relu10_2, "11_1", 128, kernel=(1,1), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - conv11_2, relu11_2 = legacy_conv_act_layer(relu11_1, "11_2", 256, kernel=(3,3), pad=(1,1), \ - stride=(2,2), act_type="relu", use_batchnorm=False) - conv12_1, relu12_1 = legacy_conv_act_layer(relu11_2, "12_1", 128, kernel=(1,1), pad=(0,0), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - conv12_2, relu12_2 = legacy_conv_act_layer(relu12_1, "12_2", 256, kernel=(4,4), pad=(1,1), \ - stride=(1,1), act_type="relu", use_batchnorm=False) - - # specific parameters for VGG16 network - from_layers = [relu4_3, relu7, relu8_2, relu9_2, relu10_2, relu11_2, relu12_2] - sizes = [[.07, .1025], [.15,.2121], [.3, .3674], [.45, .5196], [.6, .6708], \ - [.75, .8216], [.9, .9721]] - ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ - [1,2,.5,3,1./3], [1,2,.5], [1,2,.5]] - normalizations = [20, -1, -1, -1, -1, -1, -1] - steps = [ x / 512.0 for x in [8, 16, 32, 64, 128, 256, 512]] - num_channels = [512] - - loc_preds, cls_preds, anchor_boxes = multibox_layer(from_layers, \ - num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ - num_channels=num_channels, clip=False, interm_layer=0, steps=steps) - - tmp = mx.symbol.contrib.MultiBoxTarget( - *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \ - ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0, \ - negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), - name="multibox_target") - loc_target = tmp[0] - loc_target_mask = tmp[1] - cls_target = tmp[2] - - cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \ - ignore_label=-1, 
use_ignore=True, grad_scale=1., multi_output=True, \ - normalization='valid', name="cls_prob") - loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \ - data=loc_target_mask * (loc_preds - loc_target), scalar=1.0) - loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1., \ - normalization='valid', name="loc_loss") - - # monitoring training status - cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label") - det = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ - name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, - variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) - det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out") - - # group output - out = mx.symbol.Group([cls_prob, loc_loss, cls_label, det]) - return out - -def get_symbol(num_classes=20, nms_thresh=0.5, force_suppress=False, nms_topk=400): - """ - Single-shot multi-box detection with VGG 16 layers ConvNet - This is a modified version, with fc6/fc7 layers replaced by conv layers - And the network is slightly smaller than original VGG 16 network - This is the detection network - - Parameters: - ---------- - num_classes: int - number of object classes not including background - nms_thresh : float - threshold of overlap for non-maximum suppression - force_suppress : boolean - whether suppress different class objects - nms_topk : int - apply NMS to top K detections - - Returns: - ---------- - mx.Symbol - """ - net = get_symbol_train(num_classes) - cls_preds = net.get_internals()["multibox_cls_pred_output"] - loc_preds = net.get_internals()["multibox_loc_pred_output"] - anchor_boxes = net.get_internals()["multibox_anchors_output"] - - cls_prob = mx.symbol.softmax(data=cls_preds, axis=1, name='cls_prob') - out = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ - name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, - variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) - return out diff --git a/example/ssd/symbol/resnet.py b/example/ssd/symbol/resnet.py deleted file mode 100644 index d7dc3cc5bd76..000000000000 --- a/example/ssd/symbol/resnet.py +++ /dev/null @@ -1,186 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -''' -Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py -Original author Wei Wu - -Implemented the following paper: - -Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. 
"Identity Mappings in Deep Residual Networks" -''' -import mxnet as mx - -def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, bn_mom=0.9, workspace=256, memonger=False): - """Return ResNet Unit symbol for building ResNet - Parameters - ---------- - data : str - Input data - num_filter : int - Number of output channels - bnf : int - Bottle neck channels factor with regard to num_filter - stride : tupe - Stride used in convolution - dim_match : Boolen - True means channel number between input and output is the same, otherwise means differ - name : str - Base name of the operators - workspace : int - Workspace used in convolution operator - """ - if bottle_neck: - # the same as https://github.com/facebook/fb.resnet.torch#notes, a bit difference with origin paper - bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=(1,1), pad=(0,0), - no_bias=True, workspace=workspace, name=name + '_conv1') - bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2') - act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') - conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=stride, pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3') - act3 = mx.sym.Activation(data=bn3, act_type='relu', name=name + '_relu3') - conv3 = mx.sym.Convolution(data=act3, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True, - workspace=workspace, name=name + '_conv3') - if dim_match: - shortcut = data - else: - shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - workspace=workspace, name=name+'_sc') - if memonger: - shortcut._set_attr(mirror_stage='True') - return conv3 + shortcut - else: - bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - conv1 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv1') - bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2') - act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') - conv2 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - if dim_match: - shortcut = data - else: - shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - workspace=workspace, name=name+'_sc') - if memonger: - shortcut._set_attr(mirror_stage='True') - return conv2 + shortcut - -def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, memonger=False): - """Return ResNet symbol of - Parameters - ---------- - units : list - Number of units in each stage - num_stages : int - Number of stage - filter_list : list - Channel size of each stage - num_classes : int - Ouput size of symbol - dataset : str - Dataset type, only cifar10 and imagenet supports - workspace : int - Workspace used in convolution operator 
- """ - num_unit = len(units) - assert(num_unit == num_stages) - data = mx.sym.Variable(name='data') - data = mx.sym.identity(data=data, name='id') - data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data') - (nchannel, height, width) = image_shape - if height <= 32: # such as cifar10 - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1), - no_bias=True, name="conv0", workspace=workspace) - else: # often expected to be 224 such as imagenet - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3), - no_bias=True, name="conv0", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0') - body = mx.sym.Activation(data=body, act_type='relu', name='relu0') - body = mx.symbol.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max') - - for i in range(num_stages): - body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False, - name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, workspace=workspace, - memonger=memonger) - for j in range(units[i]-1): - body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2), - bottle_neck=bottle_neck, workspace=workspace, memonger=memonger) - bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1') - relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1') - # Although kernel is not used here when global_pool=True, we should put one - pool1 = mx.symbol.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') - flat = mx.symbol.Flatten(data=pool1) - fc1 = mx.symbol.FullyConnected(data=flat, num_hidden=num_classes, name='fc1') - return mx.symbol.SoftmaxOutput(data=fc1, name='softmax') - -def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, **kwargs): - """ - Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py - Original author Wei Wu - """ - image_shape = [int(l) for l in image_shape.split(',')] - (nchannel, height, width) = image_shape - if height <= 28: - num_stages = 3 - if (num_layers-2) % 9 == 0 and num_layers >= 164: - per_unit = [(num_layers-2)//9] - filter_list = [16, 64, 128, 256] - bottle_neck = True - elif (num_layers-2) % 6 == 0 and num_layers < 164: - per_unit = [(num_layers-2)//6] - filter_list = [16, 16, 32, 64] - bottle_neck = False - else: - raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) - units = per_unit * num_stages - else: - if num_layers >= 50: - filter_list = [64, 256, 512, 1024, 2048] - bottle_neck = True - else: - filter_list = [64, 64, 128, 256, 512] - bottle_neck = False - num_stages = 4 - if num_layers == 18: - units = [2, 2, 2, 2] - elif num_layers == 34: - units = [3, 4, 6, 3] - elif num_layers == 50: - units = [3, 4, 6, 3] - elif num_layers == 101: - units = [3, 4, 23, 3] - elif num_layers == 152: - units = [3, 8, 36, 3] - elif num_layers == 200: - units = [3, 24, 36, 3] - elif num_layers == 269: - units = [3, 30, 48, 8] - else: - raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) - - return resnet(units = units, - num_stages = num_stages, - filter_list = filter_list, - num_classes = num_classes, - image_shape = image_shape, - bottle_neck = bottle_neck, - workspace = conv_workspace) diff --git a/example/ssd/symbol/symbol_builder.py 
b/example/ssd/symbol/symbol_builder.py deleted file mode 100644 index 135c42e8be15..000000000000 --- a/example/ssd/symbol/symbol_builder.py +++ /dev/null @@ -1,182 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -from symbol.common import multi_layer_feature, multibox_layer - - -def import_module(module_name): - """Helper function to import module""" - import sys, os - import importlib - sys.path.append(os.path.dirname(__file__)) - return importlib.import_module(module_name) - -def get_symbol_train(network, num_classes, from_layers, num_filters, strides, pads, - sizes, ratios, normalizations=-1, steps=[], min_filter=128, - nms_thresh=0.5, force_suppress=False, nms_topk=400, **kwargs): - """Build network symbol for training SSD - - Parameters - ---------- - network : str - base network symbol name - num_classes : int - number of object classes not including background - from_layers : list of str - feature extraction layers, use '' for add extra layers - For example: - from_layers = ['relu4_3', 'fc7', '', '', '', ''] - which means extract feature from relu4_3 and fc7, adding 4 extra layers - on top of fc7 - num_filters : list of int - number of filters for extra layers, you can use -1 for extracted features, - however, if normalization and scale is applied, the number of filter for - that layer must be provided. - For example: - num_filters = [512, -1, 512, 256, 256, 256] - strides : list of int - strides for the 3x3 convolution appended, -1 can be used for extracted - feature layers - pads : list of int - paddings for the 3x3 convolution, -1 can be used for extracted layers - sizes : list or list of list - [min_size, max_size] for all layers or [[], [], []...] for specific layers - ratios : list or list of list - [ratio1, ratio2...] for all layers or [[], [], ...] for specific layers - normalizations : int or list of int - use normalizations value for all layers or [...] 
for specific layers, - -1 indicate no normalizations and scales - steps : list - specify steps for each MultiBoxPrior layer, leave empty, it will calculate - according to layer dimensions - min_filter : int - minimum number of filters used in 1x1 convolution - nms_thresh : float - non-maximum suppression threshold - force_suppress : boolean - whether suppress different class objects - nms_topk : int - apply NMS to top K detections - - Returns - ------- - mx.Symbol - - """ - label = mx.sym.Variable('label') - body = import_module(network).get_symbol(num_classes, **kwargs) - layers = multi_layer_feature(body, from_layers, num_filters, strides, pads, - min_filter=min_filter) - - loc_preds, cls_preds, anchor_boxes = multibox_layer(layers, \ - num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ - num_channels=num_filters, clip=False, interm_layer=0, steps=steps) - - tmp = mx.symbol.contrib.MultiBoxTarget( - *[anchor_boxes, label, cls_preds], overlap_threshold=.5, \ - ignore_label=-1, negative_mining_ratio=3, minimum_negative_samples=0, \ - negative_mining_thresh=.5, variances=(0.1, 0.1, 0.2, 0.2), - name="multibox_target") - loc_target = tmp[0] - loc_target_mask = tmp[1] - cls_target = tmp[2] - - cls_prob = mx.symbol.SoftmaxOutput(data=cls_preds, label=cls_target, \ - ignore_label=-1, use_ignore=True, grad_scale=1., multi_output=True, \ - normalization='valid', name="cls_prob") - loc_loss_ = mx.symbol.smooth_l1(name="loc_loss_", \ - data=loc_target_mask * (loc_preds - loc_target), scalar=1.0) - loc_loss = mx.symbol.MakeLoss(loc_loss_, grad_scale=1., \ - normalization='valid', name="loc_loss") - - # monitoring training status - cls_label = mx.symbol.MakeLoss(data=cls_target, grad_scale=0, name="cls_label") - det = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ - name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, - variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) - det = mx.symbol.MakeLoss(data=det, grad_scale=0, name="det_out") - - # group output - out = mx.symbol.Group([cls_prob, loc_loss, cls_label, det]) - return out - -def get_symbol(network, num_classes, from_layers, num_filters, sizes, ratios, - strides, pads, normalizations=-1, steps=[], min_filter=128, - nms_thresh=0.5, force_suppress=False, nms_topk=400, **kwargs): - """Build network for testing SSD - - Parameters - ---------- - network : str - base network symbol name - num_classes : int - number of object classes not including background - from_layers : list of str - feature extraction layers, use '' for add extra layers - For example: - from_layers = ['relu4_3', 'fc7', '', '', '', ''] - which means extract feature from relu4_3 and fc7, adding 4 extra layers - on top of fc7 - num_filters : list of int - number of filters for extra layers, you can use -1 for extracted features, - however, if normalization and scale is applied, the number of filter for - that layer must be provided. - For example: - num_filters = [512, -1, 512, 256, 256, 256] - strides : list of int - strides for the 3x3 convolution appended, -1 can be used for extracted - feature layers - pads : list of int - paddings for the 3x3 convolution, -1 can be used for extracted layers - sizes : list or list of list - [min_size, max_size] for all layers or [[], [], []...] for specific layers - ratios : list or list of list - [ratio1, ratio2...] for all layers or [[], [], ...] for specific layers - normalizations : int or list of int - use normalizations value for all layers or [...] 
for specific layers, - -1 indicate no normalizations and scales - steps : list - specify steps for each MultiBoxPrior layer, leave empty, it will calculate - according to layer dimensions - min_filter : int - minimum number of filters used in 1x1 convolution - nms_thresh : float - non-maximum suppression threshold - force_suppress : boolean - whether suppress different class objects - nms_topk : int - apply NMS to top K detections - - Returns - ------- - mx.Symbol - - """ - body = import_module(network).get_symbol(num_classes, **kwargs) - layers = multi_layer_feature(body, from_layers, num_filters, strides, pads, - min_filter=min_filter) - - loc_preds, cls_preds, anchor_boxes = multibox_layer(layers, \ - num_classes, sizes=sizes, ratios=ratios, normalization=normalizations, \ - num_channels=num_filters, clip=False, interm_layer=0, steps=steps) - - cls_prob = mx.symbol.softmax(data=cls_preds, axis=1, name='cls_prob') - out = mx.symbol.contrib.MultiBoxDetection(*[cls_prob, loc_preds, anchor_boxes], \ - name="detection", nms_threshold=nms_thresh, force_suppress=force_suppress, - variances=(0.1, 0.1, 0.2, 0.2), nms_topk=nms_topk) - return out diff --git a/example/ssd/symbol/symbol_factory.py b/example/ssd/symbol/symbol_factory.py deleted file mode 100644 index 3a4364a570bd..000000000000 --- a/example/ssd/symbol/symbol_factory.py +++ /dev/null @@ -1,139 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Presets for various network configurations""" -import logging -from symbol import symbol_builder - -def get_config(network, data_shape, **kwargs): - """Configuration factory for various networks - - Parameters - ---------- - network : str - base network name, such as vgg_reduced, inceptionv3, resnet... 
- data_shape : int - input data dimension - kwargs : dict - extra arguments - """ - if network == 'vgg16_reduced': - if data_shape >= 448: - from_layers = ['relu4_3', 'relu7', '', '', '', '', ''] - num_filters = [512, -1, 512, 256, 256, 256, 256] - strides = [-1, -1, 2, 2, 2, 2, 1] - pads = [-1, -1, 1, 1, 1, 1, 1] - sizes = [[.07, .1025], [.15,.2121], [.3, .3674], [.45, .5196], [.6, .6708], \ - [.75, .8216], [.9, .9721]] - ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ - [1,2,.5,3,1./3], [1,2,.5], [1,2,.5]] - normalizations = [20, -1, -1, -1, -1, -1, -1] - steps = [] if data_shape != 512 else [x / 512.0 for x in - [8, 16, 32, 64, 128, 256, 512]] - else: - from_layers = ['relu4_3', 'relu7', '', '', '', ''] - num_filters = [512, -1, 512, 256, 256, 256] - strides = [-1, -1, 2, 2, 1, 1] - pads = [-1, -1, 1, 1, 0, 0] - sizes = [[.1, .141], [.2,.272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]] - ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ - [1,2,.5], [1,2,.5]] - normalizations = [20, -1, -1, -1, -1, -1] - steps = [] if data_shape != 300 else [x / 300.0 for x in [8, 16, 32, 64, 100, 300]] - if not (data_shape == 300 or data_shape == 512): - logging.warn('data_shape %d was not tested, use with caucious.' % data_shape) - return locals() - elif network == 'inceptionv3': - from_layers = ['ch_concat_mixed_7_chconcat', 'ch_concat_mixed_10_chconcat', '', '', '', ''] - num_filters = [-1, -1, 512, 256, 256, 128] - strides = [-1, -1, 2, 2, 2, 2] - pads = [-1, -1, 1, 1, 1, 1] - sizes = [[.1, .141], [.2,.272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]] - ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ - [1,2,.5], [1,2,.5]] - normalizations = -1 - steps = [] - return locals() - elif network == 'resnet50': - num_layers = 50 - image_shape = '3,224,224' # resnet require it as shape check - network = 'resnet' - from_layers = ['_plus12', '_plus15', '', '', '', ''] - num_filters = [-1, -1, 512, 256, 256, 128] - strides = [-1, -1, 2, 2, 2, 2] - pads = [-1, -1, 1, 1, 1, 1] - sizes = [[.1, .141], [.2,.272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]] - ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ - [1,2,.5], [1,2,.5]] - normalizations = -1 - steps = [] - return locals() - elif network == 'resnet101': - num_layers = 101 - image_shape = '3,224,224' - network = 'resnet' - from_layers = ['_plus29', '_plus32', '', '', '', ''] - num_filters = [-1, -1, 512, 256, 256, 128] - strides = [-1, -1, 2, 2, 2, 2] - pads = [-1, -1, 1, 1, 1, 1] - sizes = [[.1, .141], [.2,.272], [.37, .447], [.54, .619], [.71, .79], [.88, .961]] - ratios = [[1,2,.5], [1,2,.5,3,1./3], [1,2,.5,3,1./3], [1,2,.5,3,1./3], \ - [1,2,.5], [1,2,.5]] - normalizations = -1 - steps = [] - return locals() - else: - msg = 'No configuration found for %s with data_shape %d' % (network, data_shape) - raise NotImplementedError(msg) - -def get_symbol_train(network, data_shape, **kwargs): - """Wrapper for get symbol for train - - Parameters - ---------- - network : str - name for the base network symbol - data_shape : int - input shape - kwargs : dict - see symbol_builder.get_symbol_train for more details - """ - if network.startswith('legacy'): - logging.warn('Using legacy model.') - return symbol_builder.import_module(network).get_symbol_train(**kwargs) - config = get_config(network, data_shape, **kwargs).copy() - config.update(kwargs) - return symbol_builder.get_symbol_train(**config) - -def get_symbol(network, data_shape, **kwargs): - 
"""Wrapper for get symbol for test - - Parameters - ---------- - network : str - name for the base network symbol - data_shape : int - input shape - kwargs : dict - see symbol_builder.get_symbol for more details - """ - if network.startswith('legacy'): - logging.warn('Using legacy model.') - return symbol_builder.import_module(network).get_symbol(**kwargs) - config = get_config(network, data_shape, **kwargs).copy() - config.update(kwargs) - return symbol_builder.get_symbol(**config) diff --git a/example/ssd/symbol/vgg16_reduced.py b/example/ssd/symbol/vgg16_reduced.py deleted file mode 100644 index 16535e6dc22d..000000000000 --- a/example/ssd/symbol/vgg16_reduced.py +++ /dev/null @@ -1,103 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx - -def get_symbol(num_classes=1000, **kwargs): - """ - VGG 16 layers network - This is a modified version, with fc6/fc7 layers replaced by conv layers - And the network is slightly smaller than original VGG 16 network - """ - data = mx.symbol.Variable(name="data") - label = mx.symbol.Variable(name="label") - - # group 1 - conv1_1 = mx.symbol.Convolution( - data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_1") - relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1") - conv1_2 = mx.symbol.Convolution( - data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, name="conv1_2") - relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2") - pool1 = mx.symbol.Pooling( - data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool1") - # group 2 - conv2_1 = mx.symbol.Convolution( - data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_1") - relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1") - conv2_2 = mx.symbol.Convolution( - data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, name="conv2_2") - relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2") - pool2 = mx.symbol.Pooling( - data=relu2_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool2") - # group 3 - conv3_1 = mx.symbol.Convolution( - data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_1") - relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1") - conv3_2 = mx.symbol.Convolution( - data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_2") - relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2") - conv3_3 = mx.symbol.Convolution( - data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, name="conv3_3") - relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3") - pool3 = mx.symbol.Pooling( - data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), \ - pooling_convention="full", name="pool3") - # group 4 - 
conv4_1 = mx.symbol.Convolution( - data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_1") - relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1") - conv4_2 = mx.symbol.Convolution( - data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_2") - relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2") - conv4_3 = mx.symbol.Convolution( - data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv4_3") - relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3") - pool4 = mx.symbol.Pooling( - data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool4") - # group 5 - conv5_1 = mx.symbol.Convolution( - data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_1") - relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1") - conv5_2 = mx.symbol.Convolution( - data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_2") - relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2") - conv5_3 = mx.symbol.Convolution( - data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, name="conv5_3") - relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3") - pool5 = mx.symbol.Pooling( - data=relu5_3, pool_type="max", kernel=(3, 3), stride=(1, 1), - pad=(1,1), name="pool5") - # group 6 - conv6 = mx.symbol.Convolution( - data=pool5, kernel=(3, 3), pad=(6, 6), dilate=(6, 6), - num_filter=1024, name="fc6") - relu6 = mx.symbol.Activation(data=conv6, act_type="relu", name="relu6") - # drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") - # group 7 - conv7 = mx.symbol.Convolution( - data=relu6, kernel=(1, 1), pad=(0, 0), num_filter=1024, name="fc7") - relu7 = mx.symbol.Activation(data=conv7, act_type="relu", name="relu7") - # drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") - - gpool = mx.symbol.Pooling(data=relu7, pool_type='avg', kernel=(7, 7), - global_pool=True, name='global_pool') - conv8 = mx.symbol.Convolution(data=gpool, num_filter=num_classes, kernel=(1, 1), - name='fc8') - flat = mx.symbol.Flatten(data=conv8) - softmax = mx.symbol.SoftmaxOutput(data=flat, name='softmax') - return softmax diff --git a/example/ssd/tools/__init__.py b/example/ssd/tools/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/ssd/tools/find_mxnet.py b/example/ssd/tools/find_mxnet.py deleted file mode 100644 index 0ad64cca01d7..000000000000 --- a/example/ssd/tools/find_mxnet.py +++ /dev/null @@ -1,24 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
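The docstrings above repeatedly describe vgg16_reduced as VGG 16 "with fc6/fc7 layers replaced by conv layers". The replacement works because a 3x3 convolution with pad=6 and dilate=6 has a 13x13 effective receptive field yet preserves the spatial size, so the former fully connected fc6 becomes the dilated conv shown above. A small shape-inference check; the 19x19, 512-channel input is the pool5 output size for a 300x300 image and is used here only for illustration.

```python
import mxnet as mx

# fc6 re-expressed as a dilated convolution, as in the deleted vgg16_reduced symbol.
data = mx.sym.Variable('data')
fc6 = mx.sym.Convolution(data=data, kernel=(3, 3), pad=(6, 6), dilate=(6, 6),
                         num_filter=1024, name='fc6')

# Output height/width = (19 + 2*6 - (6*(3-1) + 1)) / 1 + 1 = 19: spatial size is preserved.
_, out_shapes, _ = fc6.infer_shape(data=(1, 512, 19, 19))
print(out_shapes)  # [(1, 1024, 19, 19)]
```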
- -try: - import mxnet as mx -except ImportError: - import os, sys - curr_path = os.path.abspath(os.path.dirname(__file__)) - sys.path.append(os.path.join(curr_path, "../../../python")) - import mxnet as mx diff --git a/example/ssd/tools/prepare_coco.sh b/example/ssd/tools/prepare_coco.sh deleted file mode 100644 index 8b022ec23fc3..000000000000 --- a/example/ssd/tools/prepare_coco.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -python $DIR/prepare_dataset.py --dataset coco --set train2014,valminusminival2014 --target $DIR/../data/train.lst --root $DIR/../data/coco -python $DIR/prepare_dataset.py --dataset coco --set minival2014 --target $DIR/../data/val.lst --root $DIR/../data/coco --no-shuffle diff --git a/example/ssd/tools/prepare_dataset.py b/example/ssd/tools/prepare_dataset.py deleted file mode 100644 index c031f04d4fe7..000000000000 --- a/example/ssd/tools/prepare_dataset.py +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from __future__ import print_function -import sys, os -import argparse -import subprocess -curr_path = os.path.abspath(os.path.dirname(__file__)) -sys.path.append(os.path.join(curr_path, '..')) -from dataset.pascal_voc import PascalVoc -from dataset.mscoco import Coco -from dataset.concat_db import ConcatDB - -def load_pascal(image_set, year, devkit_path, shuffle=False): - """ - wrapper function for loading pascal voc dataset - - Parameters: - ---------- - image_set : str - train, trainval... 
- year : str - 2007, 2012 or combinations splitted by comma - devkit_path : str - root directory of dataset - shuffle : bool - whether to shuffle initial list - - Returns: - ---------- - Imdb - """ - image_set = [y.strip() for y in image_set.split(',')] - assert image_set, "No image_set specified" - year = [y.strip() for y in year.split(',')] - assert year, "No year specified" - - # make sure (# sets == # years) - if len(image_set) > 1 and len(year) == 1: - year = year * len(image_set) - if len(image_set) == 1 and len(year) > 1: - image_set = image_set * len(year) - assert len(image_set) == len(year), "Number of sets and year mismatch" - - imdbs = [] - for s, y in zip(image_set, year): - imdbs.append(PascalVoc(s, y, devkit_path, shuffle, is_train=True)) - if len(imdbs) > 1: - return ConcatDB(imdbs, shuffle) - else: - return imdbs[0] - -def load_coco(image_set, dirname, shuffle=False): - """ - wrapper function for loading ms coco dataset - - Parameters: - ---------- - image_set : str - train2014, val2014, valminusminival2014, minival2014 - dirname: str - root dir for coco - shuffle: boolean - initial shuffle - """ - anno_files = ['instances_' + y.strip() + '.json' for y in image_set.split(',')] - assert anno_files, "No image set specified" - imdbs = [] - for af in anno_files: - af_path = os.path.join(dirname, 'annotations', af) - imdbs.append(Coco(af_path, dirname, shuffle=shuffle)) - if len(imdbs) > 1: - return ConcatDB(imdbs, shuffle) - else: - return imdbs[0] - -def parse_args(): - parser = argparse.ArgumentParser(description='Prepare lists for dataset') - parser.add_argument('--dataset', dest='dataset', help='dataset to use', - default='pascal', type=str) - parser.add_argument('--year', dest='year', help='which year to use', - default='2007,2012', type=str) - parser.add_argument('--set', dest='set', help='train, val, trainval, test', - default='trainval', type=str) - parser.add_argument('--target', dest='target', help='output list file', - default=os.path.join(curr_path, '..', 'train.lst'), - type=str) - parser.add_argument('--root', dest='root_path', help='dataset root path', - default=os.path.join(curr_path, '..', 'data', 'VOCdevkit'), - type=str) - parser.add_argument('--no-shuffle', dest='shuffle', help='shuffle list', - action='store_false') - parser.add_argument('--num-thread', dest='num_thread', type=int, default=1, - help='number of thread to use while runing im2rec.py') - - args = parser.parse_args() - return args - -if __name__ == '__main__': - args = parse_args() - if args.dataset == 'pascal': - db = load_pascal(args.set, args.year, args.root_path, args.shuffle) - print("saving list to disk...") - db.save_imglist(args.target, root=args.root_path) - elif args.dataset == 'coco': - db = load_coco(args.set, args.root_path, args.shuffle) - print("saving list to disk...") - db.save_imglist(args.target, root=args.root_path) - else: - raise NotImplementedError("No implementation for dataset: " + args.dataset) - - print("List file {} generated...".format(args.target)) - - cmd_arguments = ["python", - os.path.join(curr_path, "../../../tools/im2rec.py"), - os.path.abspath(args.target), os.path.abspath(args.root_path), - "--pack-label", "--num-thread", str(args.num_thread)] - - if not args.shuffle: - cmd_arguments.append("--no-shuffle") - - subprocess.check_call(cmd_arguments) - - print("Record file {} generated...".format(args.target.split('.')[0] + '.rec')) diff --git a/example/ssd/tools/prepare_pascal.sh b/example/ssd/tools/prepare_pascal.sh deleted file mode 100755 index 
97eea262ac4d..000000000000 --- a/example/ssd/tools/prepare_pascal.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -$DIR/prepare_dataset.py --dataset pascal --year 2007,2012 --set trainval --target $DIR/../data/train.lst -$DIR/prepare_dataset.py --dataset pascal --year 2007 --set test --target $DIR/../data/val.lst --no-shuffle diff --git a/example/ssd/tools/rand_sampler.py b/example/ssd/tools/rand_sampler.py deleted file mode 100644 index 7f0cb6f8ba3d..000000000000 --- a/example/ssd/tools/rand_sampler.py +++ /dev/null @@ -1,287 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
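prepare_dataset.py accepts comma-separated --set and --year values and pairs them before building the image databases: a single year is broadcast across multiple sets (and vice versa), which is how "--year 2007,2012 --set trainval" in prepare_pascal.sh expands to two PascalVoc databases merged by ConcatDB. A minimal sketch of that pairing rule in plain Python, with illustrative values:

```python
# Pairing rule used by the deleted load_pascal helper in prepare_dataset.py.
image_set = [s.strip() for s in 'trainval'.split(',')]
year = [y.strip() for y in '2007,2012'.split(',')]

# Broadcast whichever list has a single entry so the two lists line up.
if len(image_set) > 1 and len(year) == 1:
    year = year * len(image_set)
if len(image_set) == 1 and len(year) > 1:
    image_set = image_set * len(year)
assert len(image_set) == len(year), "Number of sets and year mismatch"

print(list(zip(image_set, year)))  # [('trainval', '2007'), ('trainval', '2012')]
```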
- -import numpy as np -import math - -class RandSampler(object): - """ - Random sampler base class, used for data augmentation - - Parameters: - ---------- - max_trials : int - maximum trials, if exceed this number, give up anyway - max_sample : int - maximum random crop samples to be generated - """ - def __init__(self, max_trials, max_sample): - assert max_trials > 0 - self.max_trials = int(max_trials) - assert max_sample >= 0 - self.max_sample = int(max_sample) - - def sample(self, label): - """ - Interface for calling sampling function - - Parameters: - ---------- - label : numpy.array (n x 5 matrix) - ground-truths - - Returns: - ---------- - list of (crop_box, label) tuples, if failed, return empty list [] - """ - return NotImplementedError - - -class RandCropper(RandSampler): - """ - Random cropping original images with various settings - - Parameters: - ---------- - min_scale : float - minimum crop scale, (0, 1] - max_scale : float - maximum crop scale, (0, 1], must larger than min_scale - min_aspect_ratio : float - minimum crop aspect ratio, (0, 1] - max_aspect_ratio : float - maximum crop aspect ratio, [1, inf) - min_overlap : float - hreshold of minimum overlap between a rand crop and any gt - max_trials : int - maximum trials, if exceed this number, give up anyway - max_sample : int - maximum random crop samples to be generated - """ - def __init__(self, min_scale=1., max_scale=1., - min_aspect_ratio=1., max_aspect_ratio=1., - min_overlap=0., max_trials=50, max_sample=1): - super(RandCropper, self).__init__(max_trials, max_sample) - assert min_scale <= max_scale, "min_scale must <= max_scale" - assert 0 < min_scale and min_scale <= 1, "min_scale must in (0, 1]" - assert 0 < max_scale and max_scale <= 1, "max_scale must in (0, 1]" - self.min_scale = min_scale - self.max_scale = max_scale - assert 0 < min_aspect_ratio and min_aspect_ratio <= 1, "min_ratio must in (0, 1]" - assert 1 <= max_aspect_ratio , "max_ratio must >= 1" - self.min_aspect_ratio = min_aspect_ratio - self.max_aspect_ratio = max_aspect_ratio - assert 0 <= min_overlap and min_overlap <= 1, "min_overlap must in [0,1]" - self.min_overlap = min_overlap - - self.config = {'gt_constraint' : 'center'} - - def sample(self, label): - """ - generate random cropping boxes according to parameters - if satifactory crops generated, apply to ground-truth as well - - Parameters: - ---------- - label : numpy.array (n x 5 matrix) - ground-truths - - Returns: - ---------- - list of (crop_box, label) tuples, if failed, return empty list [] - """ - samples = [] - count = 0 - for trial in range(self.max_trials): - if count >= self.max_sample: - return samples - scale = np.random.uniform(self.min_scale, self.max_scale) - min_ratio = max(self.min_aspect_ratio, scale * scale) - max_ratio = min(self.max_aspect_ratio, 1. 
/ scale / scale) - ratio = math.sqrt(np.random.uniform(min_ratio, max_ratio)) - width = scale * ratio - height = scale / ratio - left = np.random.uniform(0., 1 - width) - top = np.random.uniform(0., 1 - height) - rand_box = (left, top, left + width, top + height) - valid_mask = np.where(label[:, 0] > -1)[0] - gt = label[valid_mask, :] - ious = self._check_satisfy(rand_box, gt) - if ious is not None: - # transform gt labels after crop, discard bad ones - l, t, r, b = rand_box - new_gt_boxes = [] - new_width = r - l - new_height = b - t - for i in range(valid_mask.size): - if ious[i] > 0: - xmin = max(0., (gt[i, 1] - l) / new_width) - ymin = max(0., (gt[i, 2] - t) / new_height) - xmax = min(1., (gt[i, 3] - l) / new_width) - ymax = min(1., (gt[i, 4] - t) / new_height) - new_gt_boxes.append([gt[i, 0], xmin, ymin, xmax, ymax]) - if not new_gt_boxes: - continue - new_gt_boxes = np.array(new_gt_boxes) - label = np.lib.pad(new_gt_boxes, - ((0, label.shape[0]-new_gt_boxes.shape[0]), (0,0)), \ - 'constant', constant_values=(-1, -1)) - samples.append((rand_box, label)) - count += 1 - return samples - - def _check_satisfy(self, rand_box, gt_boxes): - """ - check if overlap with any gt box is larger than threshold - """ - l, t, r, b = rand_box - num_gt = gt_boxes.shape[0] - ls = np.ones(num_gt) * l - ts = np.ones(num_gt) * t - rs = np.ones(num_gt) * r - bs = np.ones(num_gt) * b - mask = np.where(ls < gt_boxes[:, 1])[0] - ls[mask] = gt_boxes[mask, 1] - mask = np.where(ts < gt_boxes[:, 2])[0] - ts[mask] = gt_boxes[mask, 2] - mask = np.where(rs > gt_boxes[:, 3])[0] - rs[mask] = gt_boxes[mask, 3] - mask = np.where(bs > gt_boxes[:, 4])[0] - bs[mask] = gt_boxes[mask, 4] - w = rs - ls - w[w < 0] = 0 - h = bs - ts - h[h < 0] = 0 - inter_area = h * w - union_area = np.ones(num_gt) * max(0, r - l) * max(0, b - t) - union_area += (gt_boxes[:, 3] - gt_boxes[:, 1]) * (gt_boxes[:, 4] - gt_boxes[:, 2]) - union_area -= inter_area - ious = inter_area / union_area - ious[union_area <= 0] = 0 - max_iou = np.amax(ious) - if max_iou < self.min_overlap: - return None - # check ground-truth constraint - if self.config['gt_constraint'] == 'center': - for i in range(ious.shape[0]): - if ious[i] > 0: - gt_x = (gt_boxes[i, 1] + gt_boxes[i, 3]) / 2.0 - gt_y = (gt_boxes[i, 2] + gt_boxes[i, 4]) / 2.0 - if gt_x < l or gt_x > r or gt_y < t or gt_y > b: - return None - elif self.config['gt_constraint'] == 'corner': - for i in range(ious.shape[0]): - if ious[i] > 0: - if gt_boxes[i, 1] < l or gt_boxes[i, 3] > r \ - or gt_boxes[i, 2] < t or gt_boxes[i, 4] > b: - return None - return ious - - -class RandPadder(RandSampler): - """ - Random cropping original images with various settings - - Parameters: - ---------- - min_scale : float - minimum crop scale, [1, inf) - max_scale : float - maximum crop scale, [1, inf), must larger than min_scale - min_aspect_ratio : float - minimum crop aspect ratio, (0, 1] - max_aspect_ratio : float - maximum crop aspect ratio, [1, inf) - min_gt_scale : float - minimum ground-truth scale to be satisfied after padding, - either width or height, [0, 1] - max_trials : int - maximum trials, if exceed this number, give up anyway - max_sample : int - maximum random crop samples to be generated - """ - def __init__(self, min_scale=1., max_scale=1., min_aspect_ratio=1., \ - max_aspect_ratio=1., min_gt_scale=.01, max_trials=50, - max_sample=1): - super(RandPadder, self).__init__(max_trials, max_sample) - assert min_scale <= max_scale, "min_scale must <= max_scale" - assert min_scale >= 1, "min_scale must in (0, 1]" - 
self.min_scale = min_scale - self.max_scale = max_scale - assert 0 < min_aspect_ratio and min_aspect_ratio <= 1, "min_ratio must in (0, 1]" - assert 1 <= max_aspect_ratio , "max_ratio must >= 1" - self.min_aspect_ratio = min_aspect_ratio - self.max_aspect_ratio = max_aspect_ratio - assert 0 <= min_gt_scale and min_gt_scale <= 1, "min_gt_scale must in [0, 1]" - self.min_gt_scale = min_gt_scale - - def sample(self, label): - """ - generate random padding boxes according to parameters - if satifactory padding generated, apply to ground-truth as well - - Parameters: - ---------- - label : numpy.array (n x 5 matrix) - ground-truths - - Returns: - ---------- - list of (crop_box, label) tuples, if failed, return empty list [] - """ - samples = [] - count = 0 - for trial in range(self.max_trials): - if count >= self.max_sample: - return samples - scale = np.random.uniform(self.min_scale, self.max_scale) - min_ratio = max(self.min_aspect_ratio, scale * scale) - max_ratio = min(self.max_aspect_ratio, 1. / scale / scale) - ratio = math.sqrt(np.random.uniform(min_ratio, max_ratio)) - width = scale * ratio - if width < 1: - continue - height = scale / ratio - if height < 1: - continue - left = np.random.uniform(0., 1 - width) - top = np.random.uniform(0., 1 - height) - right = left + width - bot = top + height - rand_box = (left, top, right, bot) - valid_mask = np.where(label[:, 0] > -1)[0] - gt = label[valid_mask, :] - new_gt_boxes = [] - for i in range(gt.shape[0]): - xmin = (gt[i, 1] - left) / width - ymin = (gt[i, 2] - top) / height - xmax = (gt[i, 3] - left) / width - ymax = (gt[i, 4] - top) / height - new_size = min(xmax - xmin, ymax - ymin) - if new_size < self.min_gt_scale: - new_gt_boxes = [] - break - new_gt_boxes.append([gt[i, 0], xmin, ymin, xmax, ymax]) - if not new_gt_boxes: - continue - new_gt_boxes = np.array(new_gt_boxes) - label = np.lib.pad(new_gt_boxes, - ((0, label.shape[0]-new_gt_boxes.shape[0]), (0,0)), \ - 'constant', constant_values=(-1, -1)) - samples.append((rand_box, label)) - count += 1 - return samples diff --git a/example/ssd/tools/visualize_net.py b/example/ssd/tools/visualize_net.py deleted file mode 100644 index b3b714a7f49b..000000000000 --- a/example/ssd/tools/visualize_net.py +++ /dev/null @@ -1,44 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
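The `_check_satisfy` helper in the sampler removed above is, at its core, a vectorized IoU test between one candidate crop box and every ground-truth box. A standalone sketch of that computation in plain NumPy (illustrative names; note the removed file keeps the class id in column 0 and the four normalized coordinates in columns 1-4, while this sketch takes a bare (N, 4) box array):

import numpy as np

def crop_vs_gt_iou(crop, gt_boxes):
    # crop: (xmin, ymin, xmax, ymax) in [0, 1]; gt_boxes: (N, 4) in the same format
    l, t, r, b = crop
    ix1 = np.maximum(l, gt_boxes[:, 0])
    iy1 = np.maximum(t, gt_boxes[:, 1])
    ix2 = np.minimum(r, gt_boxes[:, 2])
    iy2 = np.minimum(b, gt_boxes[:, 3])
    inter = np.clip(ix2 - ix1, 0, None) * np.clip(iy2 - iy1, 0, None)
    union = (r - l) * (b - t) \
          + (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1]) \
          - inter
    return np.where(union > 0, inter / np.maximum(union, 1e-12), 0.0)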
- -from __future__ import print_function -import find_mxnet -import mxnet as mx -import argparse -import sys, os -sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'symbol')) -import symbol_factory - - -parser = argparse.ArgumentParser(description='network visualization') -parser.add_argument('--network', type=str, default='vgg16_reduced', - help = 'the cnn to use') -parser.add_argument('--num-classes', type=int, default=20, - help='the number of classes') -parser.add_argument('--data-shape', type=int, default=300, - help='set image\'s shape') -parser.add_argument('--train', action='store_true', default=False, help='show train net') -args = parser.parse_args() - -if not args.train: - net = symbol_factory.get_symbol(args.network, args.data_shape, num_classes=args.num_classes) - a = mx.viz.plot_network(net, shape={"data":(1,3,args.data_shape,args.data_shape)}, \ - node_attrs={"shape":'rect', "fixedsize":'false'}) - a.render("ssd_" + args.network + '_' + str(args.data_shape)) -else: - net = symbol_factory.get_symbol_train(args.network, args.data_shape, num_classes=args.num_classes) - print(net.tojson()) diff --git a/example/ssd/train.py b/example/ssd/train.py deleted file mode 100644 index 5965aeec6c7a..000000000000 --- a/example/ssd/train.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
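The removed `visualize_net.py` above only wrapped `mx.viz.plot_network` around the SSD `symbol_factory`. The plotting call itself does not depend on anything deleted in this patch and works on any plain Symbol; a minimal sketch with a made-up MLP (graphviz must be installed for `.render()`):

import mxnet as mx

data = mx.sym.var('data')
fc1 = mx.sym.FullyConnected(data, num_hidden=64, name='fc1')
act1 = mx.sym.Activation(fc1, act_type='relu', name='relu1')
fc2 = mx.sym.FullyConnected(act1, num_hidden=10, name='fc2')
mlp = mx.sym.SoftmaxOutput(fc2, name='softmax')

# same call pattern as the removed script, minus the SSD symbol factory
graph = mx.viz.plot_network(mlp, shape={'data': (1, 784)},
                            node_attrs={'shape': 'rect', 'fixedsize': 'false'})
graph.render('mlp_graph')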
- -import argparse -import mxnet as mx -import os -from train.train_net import train_net - - -def parse_args(): - parser = argparse.ArgumentParser(description='Train a Single-shot detection network') - parser.add_argument('--train-path', dest='train_path', help='train record to use', - default=os.path.join(os.getcwd(), 'data', 'train.rec'), type=str) - parser.add_argument('--train-list', dest='train_list', help='train list to use', - default="", type=str) - parser.add_argument('--val-path', dest='val_path', help='validation record to use', - default=os.path.join(os.getcwd(), 'data', 'val.rec'), type=str) - parser.add_argument('--val-list', dest='val_list', help='validation list to use', - default="", type=str) - parser.add_argument('--network', dest='network', type=str, default='vgg16_reduced', - help='which network to use') - parser.add_argument('--batch-size', dest='batch_size', type=int, default=32, - help='training batch size') - parser.add_argument('--resume', dest='resume', type=int, default=-1, - help='resume training from epoch n') - parser.add_argument('--finetune', dest='finetune', type=int, default=-1, - help='finetune from epoch n, rename the model before doing this') - parser.add_argument('--pretrained', dest='pretrained', help='pretrained model prefix', - default=os.path.join(os.getcwd(), 'model', 'vgg16_reduced'), type=str) - parser.add_argument('--epoch', dest='epoch', help='epoch of pretrained model', - default=1, type=int) - parser.add_argument('--prefix', dest='prefix', help='new model prefix', - default=os.path.join(os.getcwd(), 'model', 'ssd'), type=str) - parser.add_argument('--gpus', dest='gpus', help='GPU devices to train with', - default='0', type=str) - parser.add_argument('--begin-epoch', dest='begin_epoch', help='begin epoch of training', - default=0, type=int) - parser.add_argument('--end-epoch', dest='end_epoch', help='end epoch of training', - default=240, type=int) - parser.add_argument('--frequent', dest='frequent', help='frequency of logging', - default=20, type=int) - parser.add_argument('--data-shape', dest='data_shape', type=int, default=300, - help='set image shape') - parser.add_argument('--label-width', dest='label_width', type=int, default=350, - help='force padding label width to sync across train and validation') - parser.add_argument('--lr', dest='learning_rate', type=float, default=0.002, - help='learning rate') - parser.add_argument('--momentum', dest='momentum', type=float, default=0.9, - help='momentum') - parser.add_argument('--wd', dest='weight_decay', type=float, default=0.0005, - help='weight decay') - parser.add_argument('--mean-r', dest='mean_r', type=float, default=123, - help='red mean value') - parser.add_argument('--mean-g', dest='mean_g', type=float, default=117, - help='green mean value') - parser.add_argument('--mean-b', dest='mean_b', type=float, default=104, - help='blue mean value') - parser.add_argument('--lr-steps', dest='lr_refactor_step', type=str, default='80, 160', - help='refactor learning rate at specified epochs') - parser.add_argument('--lr-factor', dest='lr_refactor_ratio', type=float, default=0.1, - help='ratio to refactor learning rate') - parser.add_argument('--freeze', dest='freeze_pattern', type=str, default="^(conv1_|conv2_).*", - help='freeze layer pattern') - parser.add_argument('--log', dest='log_file', type=str, default="train.log", - help='save training log to file') - parser.add_argument('--monitor', dest='monitor', type=int, default=0, - help='log network parameters every N iters if larger than 0') - 
parser.add_argument('--pattern', dest='monitor_pattern', type=str, default=".*", - help='monitor parameter pattern, as regex') - parser.add_argument('--num-class', dest='num_class', type=int, default=20, - help='number of classes') - parser.add_argument('--num-example', dest='num_example', type=int, default=16551, - help='number of image examples') - parser.add_argument('--class-names', dest='class_names', type=str, - default='aeroplane, bicycle, bird, boat, bottle, bus, \ - car, cat, chair, cow, diningtable, dog, horse, motorbike, \ - person, pottedplant, sheep, sofa, train, tvmonitor', - help='string of comma separated names, or text filename') - parser.add_argument('--nms', dest='nms_thresh', type=float, default=0.45, - help='non-maximum suppression threshold') - parser.add_argument('--overlap', dest='overlap_thresh', type=float, default=0.5, - help='evaluation overlap threshold') - parser.add_argument('--force', dest='force_nms', action='store_true', - help='force non-maximum suppression on different class') - parser.add_argument('--use-difficult', dest='use_difficult', action='store_true', - help='use difficult ground-truths in evaluation') - parser.add_argument('--no-voc07', dest='use_voc07_metric', action='store_false', - help='dont use PASCAL VOC 07 11-point metric') - parser.add_argument('--kv-store', type=str, default='local', - help='key-value store type') - args = parser.parse_args() - return args - -def parse_class_names(args): - """ parse # classes and class_names if applicable """ - num_class = args.num_class - if len(args.class_names) > 0: - if os.path.isfile(args.class_names): - # try to open it to read class names - with open(args.class_names, 'r') as f: - class_names = [l.strip() for l in f.readlines()] - else: - class_names = [c.strip() for c in args.class_names.split(',')] - assert len(class_names) == num_class, str(len(class_names)) - for name in class_names: - assert len(name) > 0 - else: - class_names = None - return class_names - -if __name__ == '__main__': - args = parse_args() - # context list - ctx = [mx.gpu(int(i)) for i in args.gpus.split(',') if i.strip()] - ctx = [mx.cpu()] if not ctx else ctx - # class names if applicable - class_names = parse_class_names(args) - # start training - train_net(args.network, args.train_path, - args.num_class, args.batch_size, - args.data_shape, [args.mean_r, args.mean_g, args.mean_b], - args.resume, args.finetune, args.pretrained, - args.epoch, args.prefix, ctx, args.begin_epoch, args.end_epoch, - args.frequent, args.learning_rate, args.momentum, args.weight_decay, - args.lr_refactor_step, args.lr_refactor_ratio, - val_path=args.val_path, - num_example=args.num_example, - class_names=class_names, - label_pad_width=args.label_width, - freeze_layer_pattern=args.freeze_pattern, - iter_monitor=args.monitor, - monitor_pattern=args.monitor_pattern, - log_file=args.log_file, - nms_thresh=args.nms_thresh, - force_nms=args.force_nms, - ovp_thresh=args.overlap_thresh, - use_difficult=args.use_difficult, - voc07_metric=args.use_voc07_metric, - kv_store=args.kv_store) diff --git a/example/ssd/train/__init__.py b/example/ssd/train/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/ssd/train/metric.py b/example/ssd/train/metric.py deleted file mode 100644 index a99c8762de16..000000000000 --- a/example/ssd/train/metric.py +++ /dev/null @@ -1,83 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -import numpy as np - - -class MultiBoxMetric(mx.gluon.metric.EvalMetric): - """Calculate metrics for Multibox training """ - def __init__(self, eps=1e-8): - super(MultiBoxMetric, self).__init__('MultiBox') - self.eps = eps - self.num = 2 - self.name = ['CrossEntropy', 'SmoothL1'] - self.reset() - - def reset(self): - """ - override reset behavior - """ - if getattr(self, 'num', None) is None: - self.num_inst = 0 - self.sum_metric = 0.0 - else: - self.num_inst = [0] * self.num - self.sum_metric = [0.0] * self.num - - def update(self, labels, preds): - """ - Implementation of updating metrics - """ - # get generated multi label from network - cls_prob = preds[0].asnumpy() - loc_loss = preds[1].asnumpy() - cls_label = preds[2].asnumpy() - valid_count = np.sum(cls_label >= 0) - # overall accuracy & object accuracy - label = cls_label.flatten() - mask = np.where(label >= 0)[0] - indices = np.int64(label[mask]) - prob = cls_prob.transpose((0, 2, 1)).reshape((-1, cls_prob.shape[1])) - prob = prob[mask, indices] - self.sum_metric[0] += (-np.log(prob + self.eps)).sum() - self.num_inst[0] += valid_count - # smoothl1loss - self.sum_metric[1] += np.sum(loc_loss) - self.num_inst[1] += valid_count - - def get(self): - """Get the current evaluation result. - Override the default behavior - - Returns - ------- - name : str - Name of the metric. - value : float - Value of the evaluation. - """ - if self.num is None: - if self.num_inst == 0: - return (self.name, float('nan')) - else: - return (self.name, self.sum_metric / self.num_inst) - else: - names = ['%s'%(self.name[i]) for i in range(self.num)] - values = [x / y if y != 0 else float('nan') \ - for x, y in zip(self.sum_metric, self.num_inst)] - return (names, values) diff --git a/example/ssd/train/train_net.py b/example/ssd/train/train_net.py deleted file mode 100644 index b37e3d5abcec..000000000000 --- a/example/ssd/train/train_net.py +++ /dev/null @@ -1,279 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
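The `MultiBoxMetric` removed above is a custom `mx.gluon.metric.EvalMetric` that accumulates two values (cross-entropy and smooth-L1) per batch. Every EvalMetric subclass is driven through the same reset/update/get protocol; a generic sketch with a built-in metric and illustrative values:

import mxnet as mx

metric = mx.gluon.metric.Accuracy()
labels = [mx.nd.array([0, 1, 1])]
preds = [mx.nd.array([[0.9, 0.1], [0.2, 0.8], [0.6, 0.4]])]

metric.reset()
metric.update(labels, preds)   # accumulate statistics for one batch
name, value = metric.get()     # -> ('accuracy', 0.666...)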
- -import tools.find_mxnet -import mxnet as mx -import logging -import sys -import os -import importlib -import re -from dataset.iterator import DetRecordIter -from train.metric import MultiBoxMetric -from evaluate.eval_metric import MApMetric, VOC07MApMetric -from config.config import cfg -from symbol.symbol_factory import get_symbol_train - -def convert_pretrained(name, args): - """ - Special operations need to be made due to name inconsistance, etc - - Parameters: - --------- - name : str - pretrained model name - args : dict - loaded arguments - - Returns: - --------- - processed arguments as dict - """ - return args - -def get_lr_scheduler(learning_rate, lr_refactor_step, lr_refactor_ratio, - num_example, batch_size, begin_epoch): - """ - Compute learning rate and refactor scheduler - - Parameters: - --------- - learning_rate : float - original learning rate - lr_refactor_step : comma separated str - epochs to change learning rate - lr_refactor_ratio : float - lr *= ratio at certain steps - num_example : int - number of training images, used to estimate the iterations given epochs - batch_size : int - training batch size - begin_epoch : int - starting epoch - - Returns: - --------- - (learning_rate, mx.lr_scheduler) as tuple - """ - assert lr_refactor_ratio > 0 - iter_refactor = [int(r) for r in lr_refactor_step.split(',') if r.strip()] - if lr_refactor_ratio >= 1: - return (learning_rate, None) - else: - lr = learning_rate - epoch_size = num_example // batch_size - for s in iter_refactor: - if begin_epoch >= s: - lr *= lr_refactor_ratio - if lr != learning_rate: - logging.getLogger().info("Adjusted learning rate to {} for epoch {}".format(lr, begin_epoch)) - steps = [epoch_size * (x - begin_epoch) for x in iter_refactor if x > begin_epoch] - if not steps: - return (lr, None) - lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=lr_refactor_ratio) - return (lr, lr_scheduler) - -def train_net(net, train_path, num_classes, batch_size, - data_shape, mean_pixels, resume, finetune, pretrained, epoch, - prefix, ctx, begin_epoch, end_epoch, frequent, learning_rate, - momentum, weight_decay, lr_refactor_step, lr_refactor_ratio, - freeze_layer_pattern='', - num_example=10000, label_pad_width=350, - nms_thresh=0.45, force_nms=False, ovp_thresh=0.5, - use_difficult=False, class_names=None, - voc07_metric=False, nms_topk=400, force_suppress=False, - train_list="", val_path="", val_list="", iter_monitor=0, - monitor_pattern=".*", log_file=None, kv_store=None): - """ - Wrapper for training phase. 
- - Parameters: - ---------- - net : str - symbol name for the network structure - train_path : str - record file path for training - num_classes : int - number of object classes, not including background - batch_size : int - training batch-size - data_shape : int or tuple - width/height as integer or (3, height, width) tuple - mean_pixels : tuple of floats - mean pixel values for red, green and blue - resume : int - resume from previous checkpoint if > 0 - finetune : int - fine-tune from previous checkpoint if > 0 - pretrained : str - prefix of pretrained model, including path - epoch : int - load epoch of either resume/finetune/pretrained model - prefix : str - prefix for saving checkpoints - ctx : [mx.cpu()] or [mx.gpu(x)] - list of mxnet contexts - begin_epoch : int - starting epoch for training, should be 0 if not otherwise specified - end_epoch : int - end epoch of training - frequent : int - frequency to print out training status - learning_rate : float - training learning rate - momentum : float - trainig momentum - weight_decay : float - training weight decay param - lr_refactor_ratio : float - multiplier for reducing learning rate - lr_refactor_step : comma separated integers - at which epoch to rescale learning rate, e.g. '30, 60, 90' - freeze_layer_pattern : str - regex pattern for layers need to be fixed - num_example : int - number of training images - label_pad_width : int - force padding training and validation labels to sync their label widths - nms_thresh : float - non-maximum suppression threshold for validation - force_nms : boolean - suppress overlaped objects from different classes - train_list : str - list file path for training, this will replace the embeded labels in record - val_path : str - record file path for validation - val_list : str - list file path for validation, this will replace the embeded labels in record - iter_monitor : int - monitor internal stats in networks if > 0, specified by monitor_pattern - monitor_pattern : str - regex pattern for monitoring network stats - log_file : str - log to file if enabled - """ - # set up logger - logging.basicConfig() - logger = logging.getLogger() - logger.setLevel(logging.INFO) - if log_file: - fh = logging.FileHandler(log_file) - logger.addHandler(fh) - - # check args - if isinstance(data_shape, int): - data_shape = (3, data_shape, data_shape) - assert len(data_shape) == 3 and data_shape[0] == 3 - prefix += '_' + net + '_' + str(data_shape[1]) - - if isinstance(mean_pixels, (int, float)): - mean_pixels = [mean_pixels, mean_pixels, mean_pixels] - assert len(mean_pixels) == 3, "must provide all RGB mean values" - - train_iter = DetRecordIter(train_path, batch_size, data_shape, mean_pixels=mean_pixels, - label_pad_width=label_pad_width, path_imglist=train_list, **cfg.train) - - if val_path: - val_iter = DetRecordIter(val_path, batch_size, data_shape, mean_pixels=mean_pixels, - label_pad_width=label_pad_width, path_imglist=val_list, **cfg.valid) - else: - val_iter = None - - # load symbol - net = get_symbol_train(net, data_shape[1], num_classes=num_classes, - nms_thresh=nms_thresh, force_suppress=force_suppress, nms_topk=nms_topk) - - # define layers with fixed weight/bias - if freeze_layer_pattern.strip(): - re_prog = re.compile(freeze_layer_pattern) - fixed_param_names = [name for name in net.list_arguments() if re_prog.match(name)] - else: - fixed_param_names = None - - # load pretrained or resume from previous state - ctx_str = '('+ ','.join([str(c) for c in ctx]) + ')' - if resume > 0: - logger.info("Resume 
training with {} from epoch {}" - .format(ctx_str, resume)) - _, args, auxs = mx.model.load_checkpoint(prefix, resume) - begin_epoch = resume - elif finetune > 0: - logger.info("Start finetuning with {} from epoch {}" - .format(ctx_str, finetune)) - _, args, auxs = mx.model.load_checkpoint(prefix, finetune) - begin_epoch = finetune - # the prediction convolution layers name starts with relu, so it's fine - fixed_param_names = [name for name in net.list_arguments() \ - if name.startswith('conv')] - elif pretrained: - logger.info("Start training with {} from pretrained model {}" - .format(ctx_str, pretrained)) - _, args, auxs = mx.model.load_checkpoint(pretrained, epoch) - args = convert_pretrained(pretrained, args) - else: - logger.info("Experimental: start training from scratch with {}" - .format(ctx_str)) - args = None - auxs = None - fixed_param_names = None - - # helper information - if fixed_param_names: - logger.info("Freezed parameters: [" + ','.join(fixed_param_names) + ']') - - # init training module - mod = mx.mod.Module(net, label_names=('label',), logger=logger, context=ctx, - fixed_param_names=fixed_param_names) - - # fit parameters - batch_end_callback = mx.callback.Speedometer(train_iter.batch_size, frequent=frequent) - epoch_end_callback = mx.callback.do_checkpoint(prefix) - learning_rate, lr_scheduler = get_lr_scheduler(learning_rate, lr_refactor_step, - lr_refactor_ratio, num_example, batch_size, begin_epoch) - optimizer_params={'learning_rate':learning_rate, - 'momentum':momentum, - 'wd':weight_decay, - 'lr_scheduler':lr_scheduler, - 'clip_gradient':None, - 'rescale_grad': 1.0 / len(ctx) if len(ctx) > 0 else 1.0 } - monitor = mx.mon.Monitor(iter_monitor, pattern=monitor_pattern) if iter_monitor > 0 else None - - # run fit net, every n epochs we run evaluation network to get mAP - if voc07_metric: - valid_metric = VOC07MApMetric(ovp_thresh, use_difficult, class_names, pred_idx=3) - else: - valid_metric = MApMetric(ovp_thresh, use_difficult, class_names, pred_idx=3) - - # create kvstore when there are gpus - kv = mx.kvstore.create(kv_store) if kv_store else None - - mod.fit(train_iter, - val_iter, - eval_metric=MultiBoxMetric(), - validation_metric=valid_metric, - batch_end_callback=batch_end_callback, - epoch_end_callback=epoch_end_callback, - optimizer='sgd', - optimizer_params=optimizer_params, - begin_epoch=begin_epoch, - num_epoch=end_epoch, - initializer=mx.init.Xavier(), - arg_params=args, - aux_params=auxs, - allow_missing=True, - monitor=monitor, - kvstore=kv) diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index ae90e4b958ab..a1523bd77154 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -76,9 +76,6 @@ from . import profiler from . import log -from . import module -from . import module as mod - from . import image from . import image as img diff --git a/python/mxnet/callback.py b/python/mxnet/callback.py index bd515707eace..7ada7fe029f8 100644 --- a/python/mxnet/callback.py +++ b/python/mxnet/callback.py @@ -23,34 +23,6 @@ import time from .model import save_checkpoint -def module_checkpoint(mod, prefix, period=1, save_optimizer_states=False): - """Callback to checkpoint Module to prefix every epoch. - - Parameters - ---------- - mod : subclass of BaseModule - The module to checkpoint. - prefix : str - The file prefix for this checkpoint. - period : int - How many epochs to wait before checkpointing. Defaults to 1. - save_optimizer_states : bool - Indicates whether or not to save optimizer states for continued training. 
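The `mod.fit` call removed above routed the SGD hyper-parameters and the `MultiFactorScheduler` through `optimizer_params`. With `mx.module` gone, the equivalent Gluon-side wiring hands the same kind of scheduler to a `gluon.Trainer`. The snippet below is only a hedged sketch with illustrative values (the tiny Dense block stands in for a real detection network) and is not part of this patch:

import mxnet as mx
from mxnet import gluon

net = gluon.nn.Dense(10)
net.initialize()

# decay by 0.1 after the given numbers of *iterations*, which is what
# the removed get_lr_scheduler() derived from epoch counts
scheduler = mx.lr_scheduler.MultiFactorScheduler(step=[40000, 80000], factor=0.1)

trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': 0.002,
                         'momentum': 0.9,
                         'wd': 0.0005,
                         'lr_scheduler': scheduler})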
- - Returns - ------- - callback : function - The callback function that can be passed as iter_end_callback to fit. - """ - period = int(max(1, period)) - # pylint: disable=unused-argument - def _callback(iter_no, sym=None, arg=None, aux=None): - """The checkpoint function.""" - if (iter_no + 1) % period == 0: - mod.save_checkpoint(prefix, iter_no + 1, save_optimizer_states) - return _callback - - def do_checkpoint(prefix, period=1): """A callback that saves a model checkpoint every few epochs. Each checkpoint is made up of a couple of binary files: a model description file and a diff --git a/python/mxnet/contrib/amp/amp.py b/python/mxnet/contrib/amp/amp.py index 688d73e23ffd..ac70cfa08850 100644 --- a/python/mxnet/contrib/amp/amp.py +++ b/python/mxnet/contrib/amp/amp.py @@ -33,7 +33,6 @@ from ... import symbol from ...context import gpu from ...symbol import Symbol -from ...module import BucketingModule from ...symbol import contrib as symbol_contrib from ... import ndarray from ...ndarray import NDArray, _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP @@ -723,69 +722,6 @@ def convert_hybrid_block(block, target_dtype="float16", target_dtype_ops=None, ret.collect_params().load_dict(arg_dict, ctx=ctx) return ret -def convert_bucketing_module(bucketing_mod, target_dtype="float16", target_dtype_ops=None, - fp32_ops=None, conditional_fp32_ops=None, - excluded_sym_names=None, cast_optional_params=False): - """Given a bucketing module cast the symbols associated with the BucketingModule - and params if cast_optional_params is set. - bucketing_mod : BucketingModule instance - target_dtype : str - Currently only supports float16. The target dtype indicates to add cast layers - when possible so that lower precision computation can be leveraged. - target_dtype_ops : list of strs - Override the list of operator names casted to target_dtype. - If None, uses the framework's default list to be casted to target dtype. - fp32_ops : list of strs - Override the lists of operator names casted to FP32. - If None, uses the framework's default list to be casted to FP32. - widest_dtype_ops : list of strs - A list of op names provided by user which should run in widest precision among its inputs. - If None, uses the framework's default list of widest_precision_ops. - conditional_fp32_ops : list of (string, string, list of string) - Override the list of operators to be casted to FP32. - The format of the list is - (name of the function, name of the parameter, - list of values of the parameter that make the operator to be casted to - fp32) - excluded_sym_names : list of strs - A list of strings that represent the names of symbols that users want to exclude - from being executed in lower precision. - cast_optional_params : bool, default False - Whether to cast the arg_params and aux_params that don't require to be in LP16 - because of a cast layer following it, but will reduce the computation and memory - overhead of the model if casted. 
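With `convert_bucketing_module` deleted, the AMP conversion entry point that remains in this file is the Gluon one, `convert_hybrid_block` (its signature is visible as unchanged context in the hunk header above). A hedged usage sketch, with an arbitrary model-zoo network and assuming a GPU is available:

import mxnet as mx
from mxnet.contrib import amp
from mxnet.gluon.model_zoo import vision

net = vision.resnet18_v1(pretrained=False)
net.initialize(ctx=mx.gpu(0))
net.hybridize()
net(mx.nd.zeros((1, 3, 224, 224), ctx=mx.gpu(0)))   # forward once so the graph is cached

net_fp16 = amp.convert_hybrid_block(net, target_dtype='float16')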
- """ - assert isinstance(bucketing_mod, BucketingModule), "module should be instance of bucketing module" - assert len(bucketing_mod._buckets) > 0, "Bucketing Module should not be empty" - - sym_dict = {} - assert bucketing_mod.params_initialized, \ - "bucketing_mod params should be initialized for mixed precision conversion" - arg_params, aux_params = bucketing_mod._curr_module._arg_params, bucketing_mod._curr_module._aux_params - for key, val in bucketing_mod._buckets.items(): - sym_dict[key], result_arg_params, result_aux_params = convert_model(val._symbol, - arg_params, - aux_params, - target_dtype=target_dtype, - target_dtype_ops=target_dtype_ops, - fp32_ops=fp32_ops, - conditional_fp32_ops=conditional_fp32_ops, - excluded_sym_names=excluded_sym_names, - cast_optional_params=cast_optional_params) - result_mod = BucketingModule.load_dict(sym_dict, - sym_gen=bucketing_mod._sym_gen, - arg_params=result_arg_params, - aux_params=result_aux_params, - default_bucket_key=bucketing_mod._default_bucket_key, - logger=bucketing_mod.logger, - context=bucketing_mod._context, - work_load_list=bucketing_mod._work_load_list, - fixed_param_names=bucketing_mod._fixed_param_names, - state_names=bucketing_mod._state_names, - group2ctxs=bucketing_mod._group2ctxs, - compression_params=bucketing_mod._compression_params) - return result_mod - def list_lp16_ops(target_dtype): """Get the default list of LP16 ops for AMP """ diff --git a/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py b/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py index bd48d26ef6ba..418fb084aeab 100644 --- a/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py +++ b/python/mxnet/contrib/onnx/onnx2mx/_import_helper.py @@ -19,7 +19,7 @@ # pylint: disable=invalid-name """Operator attributes conversion""" from ._op_translations import identity, random_uniform, random_normal, sample_multinomial -from ._op_translations import add, subtract, multiply, divide, absolute, negative, add_n +from ._op_translations import absolute, negative, add_n from ._op_translations import tanh, arccos, arcsin, arctan, _cos, _sin, _tan from ._op_translations import softplus, shape, gather, lp_pooling, size from ._op_translations import ceil, floor, hardsigmoid, global_lppooling @@ -50,10 +50,6 @@ 'RandomNormalLike' : random_normal, 'Multinomial' : sample_multinomial, # Arithmetic Operators - 'Add' : add, - 'Sub' : subtract, - 'Mul' : multiply, - 'Div' : divide, 'Abs' : absolute, 'Neg' : negative, 'Sum' : add_n, #elemwise sum diff --git a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py index 311fd86ef623..60ca44df387f 100644 --- a/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py +++ b/python/mxnet/contrib/onnx/onnx2mx/_op_translations.py @@ -62,51 +62,6 @@ def sample_multinomial(attrs, inputs, proto_obj): new_attrs['dtype'] = TENSOR_TYPE_TO_NP_TYPE[int(attrs.get('dtype', 6))] return 'sample_multinomial', new_attrs, inputs -# Arithmetic Operations -def add(attrs, inputs, proto_obj): - """Adding two tensors""" - new_attr = {} - - if 'broadcast' in attrs and attrs['broadcast'] == 1: - broadcast_axis = attrs['axis'] - op_value = translation_utils._fix_broadcast('broadcast_add', inputs, - broadcast_axis, proto_obj) - return op_value, new_attr, inputs - return 'broadcast_add', new_attr, inputs - -def subtract(attrs, inputs, proto_obj): - """Subtracting two tensors""" - new_attr = {} - - if 'broadcast' in attrs and attrs['broadcast'] == 1: - broadcast_axis = attrs['axis'] - op_value = 
translation_utils._fix_broadcast('broadcast_sub', inputs, - broadcast_axis, proto_obj) - return op_value, new_attr, inputs - return 'broadcast_sub', new_attr, inputs - -def multiply(attrs, inputs, proto_obj): - """Multiply two tensors""" - new_attr = {} - - if 'broadcast' in attrs and attrs['broadcast'] == 1: - broadcast_axis = attrs['axis'] - op_value = translation_utils._fix_broadcast('broadcast_mul', inputs, - broadcast_axis, proto_obj) - return op_value, new_attr, inputs - return 'broadcast_mul', new_attr, inputs - -def divide(attrs, inputs, proto_obj): - """Divide two tensors""" - new_attr = {} - - if 'broadcast' in attrs and attrs['broadcast'] == 1: - broadcast_axis = attrs['axis'] - op_value = translation_utils._fix_broadcast('broadcast_div', inputs, - broadcast_axis, proto_obj) - return op_value, new_attr, inputs - return 'broadcast_div', new_attr, inputs - def mean(attrs, inputs, proto_obj): """Mean of all the input tensors.""" concat_input = [symbol.expand_dims(op_input, axis=0) for op_input in inputs] diff --git a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py b/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py index 680c05699c1f..376d72d25949 100644 --- a/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py +++ b/python/mxnet/contrib/onnx/onnx2mx/_translation_utils.py @@ -19,10 +19,6 @@ """Utilities used for translating operators from Onnx to Mxnet.""" # pylint: disable=protected-access from .... import symbol -from .... import module -from .... import context -from .... import ndarray as nd -from .... import io def _fix_attribute_names(attrs, change_map): @@ -155,23 +151,6 @@ def _fix_bias(op_name, attrs, num_inputs): raise ValueError("Unexpected number of inputs for: {}".format(op_name)) return attrs -def _fix_broadcast(op_name, inputs, broadcast_axis, proto_obj): - """A workaround to reshape bias term to (1, num_channel).""" - if int(len(proto_obj._params)) > 0: - assert len(list(inputs)) == 2 - - input0_shape = get_input_shape(inputs[0], proto_obj) - #creating reshape shape - reshape_shape = list(len(input0_shape) * (1,)) - reshape_shape[broadcast_axis] = -1 - reshape_shape = tuple(reshape_shape) - reshape_op_sym = symbol.reshape(inputs[1], shape=reshape_shape) - op_sym = getattr(symbol, op_name)(inputs[0], reshape_op_sym) - else: - op_sym = op_name - return op_sym - - def _fix_channels(op_name, attrs, inputs, proto_obj): """A workaround for getting 'channels' or 'units' since onnx don't provide these attributes. We check the shape of weights provided to get the number. 
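The deleted `_fix_broadcast` helper emulated ONNX's pre-opset-7 `broadcast`/`axis` attributes: it reshaped the second operand to a shape of all 1s with -1 at the broadcast axis, then applied the corresponding `broadcast_*` operator. The core of that trick, stripped of the importer plumbing (shapes here are made up):

import mxnet as mx

x = mx.nd.ones((2, 3, 4, 5))     # first operand
bias = mx.nd.arange(3)           # second operand, to be broadcast along axis 1

axis = 1
reshape_shape = [1] * x.ndim
reshape_shape[axis] = -1
bias = bias.reshape(tuple(reshape_shape))   # -> shape (1, 3, 1, 1)

out = mx.nd.broadcast_add(x, bias)          # result shape (2, 3, 4, 5)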
@@ -211,36 +190,3 @@ def _fix_gemm(op_name, inputs, old_attr, proto_obj): new_inputs = [alpha*inputs[0], inputs[1], beta*inputs[2]] new_attr = {'num_hidden' : proto_obj._params[inputs[2].name].shape[0]} return op_sym, new_attr, new_inputs - -def get_input_shape(sym, proto_obj): - """Helper function to obtain the shape of an array""" - arg_params = proto_obj.arg_dict - aux_params = proto_obj.aux_dict - - model_input_shape = [data[1] for data in proto_obj.model_metadata.get('input_tensor_data')] - data_names = [data[0] for data in proto_obj.model_metadata.get('input_tensor_data')] - - # creating dummy inputs - inputs = [] - for in_shape in model_input_shape: - inputs.append(nd.ones(shape=in_shape)) - - data_shapes = [] - for idx, input_name in enumerate(data_names): - data_shapes.append((input_name, inputs[idx].shape)) - - ctx = context.cpu() - # create a module - mod = module.Module(symbol=sym, data_names=data_names, context=ctx, label_names=None) - mod.bind(for_training=False, data_shapes=data_shapes, label_shapes=None) - mod.set_params(arg_params=arg_params, aux_params=aux_params) - - data_forward = [] - for idx, input_name in enumerate(data_names): - val = inputs[idx] - data_forward.append(val) - - mod.forward(io.DataBatch(data_forward)) - result = mod.get_outputs()[0].asnumpy() - - return result.shape diff --git a/python/mxnet/contrib/quantization.py b/python/mxnet/contrib/quantization.py index 0305adb48946..af3001235de0 100644 --- a/python/mxnet/contrib/quantization.py +++ b/python/mxnet/contrib/quantization.py @@ -25,8 +25,6 @@ import ctypes import logging import os -import shutil -import warnings import numpy as np from ..base import _LIB, check_call, py_str from ..base import c_array, c_str, mx_uint, c_str_array @@ -35,11 +33,9 @@ from ..symbol import load as sym_load from .. import ndarray from ..ndarray import load as nd_load -from ..ndarray import save as nd_save from ..ndarray import NDArray from ..io import DataIter, DataDesc, DataBatch from ..context import cpu, Context -from ..module import Module def _quantize_params(qsym, params, th_dict): @@ -459,194 +455,6 @@ def _as_data_iter(calib_data): calib_data = _DataIterWrapper(calib_data) return calib_data, calib_data.provide_data -def quantize_model(sym, arg_params, aux_params, - data_names=('data',), label_names=('softmax_label',), - ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy', - calib_data=None, num_calib_examples=None, - quantized_dtype='int8', quantize_mode='smart', - quantize_granularity='tensor-wise', logger=None): - """User-level API for generating a quantized model from a FP32 model w/ or w/o calibration. - The backend quantized operators are only enabled for Linux systems. Please do not run - inference using the quantized models on Windows for now. - The quantization implementation adopts the TensorFlow's approach: - https://www.tensorflow.org/performance/quantization. - The calibration implementation borrows the idea of Nvidia's 8-bit Inference with TensorRT: - http://on-demand.gputechconf.com/gtc/2017/presentation/s7310-8-bit-inference-with-tensorrt.pdf - and adapts the method to MXNet. - - Parameters - ---------- - sym : str or Symbol - Defines the structure of a neural network for FP32 data types. - arg_params : dict - Dictionary of name to `NDArray`. - aux_params : dict - Dictionary of name to `NDArray`. - data_names : a list of strs - Data names required for creating a Module object to run forward propagation on the - calibration dataset. 
- label_names : a list of strs - Label names required for creating a Module object to run forward propagation on the - calibration dataset. - ctx : Context - Defines the device that users want to run forward propagation on the calibration - dataset for collecting layer output statistics. Currently, only supports single context. - excluded_sym_names : list of strings - A list of strings representing the names of the symbols that users want to excluding - from being quantized. - excluded_op_names : list of strings - A list of strings representing the names of the operators that users want to excluding - from being quantized. - calib_mode : str - If calib_mode='none', no calibration will be used and the thresholds for - requantization after the corresponding layers will be calculated at runtime by - calling min and max operators. The quantized models generated in this - mode are normally 10-20% slower than those with calibrations during inference. - If calib_mode='naive', the min and max values of the layer outputs from a calibration - dataset will be directly taken as the thresholds for quantization. - If calib_mode='entropy' (default mode), the thresholds for quantization will be - derived such that the KL divergence between the distributions of FP32 layer outputs and - quantized layer outputs is minimized based upon the calibration dataset. - calib_data : DataIter - A data iterator initialized by the calibration dataset. - num_calib_examples : int or None - The maximum number of examples that user would like to use for calibration. If not provided, - the whole calibration dataset will be used. - quantized_dtype : str - The quantized destination type for input data. Currently support 'int8', 'uint8' and 'auto'. - 'auto' means automatically select output type according to calibration result. - Default value is 'int8'. - quantize_mode : str - The mode that quantization pass to apply. Support 'full' and 'smart'. - 'full' means quantize all operator if possible. - 'smart' means quantization pass will smartly choice which operator should be quantized. - quantize_granularity: str - The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise' - quantization. The default value is 'tensor-wise'. - logger : Object - A logging object for printing information during the process of quantization. - - Returns - ------- - tuple - A tuple of quantized symbol, quantized arg_params, and aux_params. - ------- - """ - if excluded_sym_names is None: - excluded_sym_names = [] - if not isinstance(excluded_sym_names, list): - raise ValueError('excluded_sym_names must be a list of strings representing' - ' the names of the symbols that will not be quantized,' - ' while received type %s' % str(type(excluded_sym_names))) - - if excluded_op_names is None: - excluded_op_names = [] - if not isinstance(excluded_op_names, list): - raise ValueError('excluded_op_names must be a list of strings representing' - ' the names of the operators that will not be quantized,' - ' while received type %s' % str(type(excluded_op_names))) - - if logger: - os.environ['MXNET_QUANTIZATION_VERBOSE'] = '1' - logger.info('Quantizing symbol') - if quantized_dtype not in ('int8', 'uint8', 'auto'): - raise ValueError('unknown quantized_dtype %s received,' - ' expected `int8`, `uint8` or `auto`' % quantized_dtype) - if quantize_granularity not in ('tensor-wise', 'channel-wise'): - raise ValueError('unkonwn quantize_granularity %s received,' - ' expected `tensor-wise` or `channel-wise`.' 
% quantize_granularity) - qsym, calib_layer = _quantize_symbol(sym, ctx, excluded_symbols=excluded_sym_names, - excluded_operators=excluded_op_names, - offline_params=list(arg_params.keys()), - quantized_dtype=quantized_dtype, - quantize_mode=quantize_mode, - quantize_granularity=quantize_granularity) - th_dict = {} - if calib_mode is not None and calib_mode != 'none': - if not isinstance(ctx, Context): - raise ValueError('currently only supports single ctx, while received %s' % str(ctx)) - if calib_data is None: - raise ValueError('calib_data must be provided when calib_mode=%s' % calib_mode) - if not isinstance(calib_data, DataIter): - raise ValueError('calib_data must be of DataIter type when calib_mode=%s,' - ' while received type %s' % (calib_mode, str(type(calib_data)))) - - mod = Module(symbol=sym, data_names=data_names, label_names=label_names, context=ctx) - if len(calib_data.provide_label) > 0: - mod.bind(for_training=False, data_shapes=calib_data.provide_data, - label_shapes=calib_data.provide_label) - else: - mod.bind(for_training=False, data_shapes=calib_data.provide_data) - mod.set_params(arg_params, aux_params) - if calib_mode == 'entropy': - hist_dict, num_examples = _collect_layer_histogram(mod, calib_data, - include_layer=calib_layer, - max_num_examples=num_calib_examples, - logger=logger) - if logger: - logger.info('Collected layer outputs from FP32 model using %d examples' % num_examples) - logger.info('Calculating optimal thresholds for quantization') - th_dict = _get_optimal_thresholds(hist_dict, quantized_dtype, logger=logger) - elif calib_mode == 'naive': - th_dict, num_examples = _collect_layer_output_min_max( - mod, calib_data, quantized_dtype, include_layer=calib_layer, max_num_examples=num_calib_examples, - logger=logger) - if logger: - logger.info('Collected layer output min/max values from FP32 model using %d examples' - % num_examples) - else: - raise ValueError('unknown calibration mode %s received,' - ' expected `none`, `naive`, or `entropy`' % calib_mode) - qsym = _calibrate_quantized_sym(qsym, th_dict) - - if logger: - logger.info('Quantizing parameters') - qarg_params = _quantize_params(qsym, arg_params, th_dict) - - return qsym, qarg_params, aux_params - -def quantize_model_mkldnn(sym, arg_params, aux_params, - data_names=('data',), label_names=('softmax_label',), - ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, - calib_mode='entropy', calib_data=None, num_calib_examples=None, - quantized_dtype='int8', quantize_mode='smart', - quantize_granularity='tensor-wise', logger=None): - """User-level API for generating a fusion + quantized model from a FP32 model - w/ or w/o calibration with Intel MKL-DNN. - The backend quantized operators are only enabled for Linux systems. Please do not run - inference using the quantized models on Windows for now. - - Parameters - ---------- - same with quantize_model - - Returns - ------- - tuple - A tuple of quantized symbol, quantized arg_params, and aux_params. 
- ------- - """ - if not isinstance(ctx, Context): - raise ValueError('currently only supports single ctx, while received %s' % str(ctx)) - if ctx.device_type != 'cpu': - raise ValueError( - 'quantize_model_mkldnn only support Intel cpu platform with MKL-DNN Backend') - - sym = sym.get_backend_symbol('MKLDNN_QUANTIZE') - - qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params, - data_names=data_names, label_names=label_names, - ctx=ctx, excluded_sym_names=excluded_sym_names, - excluded_op_names=excluded_op_names, - calib_mode=calib_mode, calib_data=calib_data, - num_calib_examples=num_calib_examples, - quantized_dtype=quantized_dtype, quantize_mode=quantize_mode, - quantize_granularity=quantize_granularity, logger=logger) - - qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE') - - return qsym, qarg_params, aux_params - def quantize_graph(sym, arg_params, aux_params, ctx=cpu(), excluded_sym_names=None, excluded_op_names=None, calib_mode='entropy', quantized_dtype='int8', @@ -821,202 +629,3 @@ def calib_graph(qsym, arg_params, aux_params, collector, qarg_params = _quantize_params(qsym, arg_params, th_dict) return qsym, qarg_params, aux_params - -def quantize_net_v2(network, quantized_dtype='auto', quantize_mode='full', quantize_granularity='tensor-wise', - exclude_layers=None, exclude_layers_match=None, exclude_operators=None, - calib_data=None, data_shapes=None, calib_mode='none', - num_calib_examples=None, ctx=cpu(), LayerOutputCollector=None, logger=None): - """User-level API for Gluon users to generate a quantized SymbolBlock from a FP32 HybridBlock w/ or w/o calibration. - The backend quantized operators are only enabled for Linux systems. Please do not run - inference using the quantized models on Windows for now. - - Parameters - ---------- - network : Gluon HybridBlock - Defines the structure of a neural network for FP32 data types. - quantized_dtype : str - The quantized destination type for input data. Currently support 'int8' - , 'uint8' and 'auto'. 'auto' means automatically select output type according to calibration result. - Default value is 'int8'. - quantize_mode : str - The mode that quantization pass to apply. Support 'full' and 'smart'. - 'full' means quantize all operator if possible. - 'smart' means quantization pass will smartly choice which operator should be quantized. - quantize_granularity: str - The granularity of quantization, currently supports 'tensor-wise' and 'channel-wise' - quantization. The default value is 'tensor-wise'. - exclude_layers : list of strings - A list of strings representing the names of the symbols that users want to excluding - exclude_layers_match : list of strings - A list of strings wildcard matching the names of the symbols that users want to excluding - from being quantized. - exclude_operators : list of strings - A list of strings representing the names of the operators that users want to excluding - calib_data : mx.io.DataIter or gluon.DataLoader - A iterable data loading object. - data_shapes : list - List of DataDesc, required if calib_data is not provided - calib_mode : str - If calib_mode='none', no calibration will be used and the thresholds for - requantization after the corresponding layers will be calculated at runtime by - calling min and max operators. The quantized models generated in this - mode are normally 10-20% slower than those with calibrations during inference. 
- If calib_mode='naive', the min and max values of the layer outputs from a calibration - dataset will be directly taken as the thresholds for quantization. - If calib_mode='entropy' (default mode), the thresholds for quantization will be - derived such that the KL divergence between the distributions of FP32 layer outputs and - quantized layer outputs is minimized based upon the calibration dataset. - num_calib_examples : int or None - The maximum number of examples that user would like to use for calibration. If not provided, - the whole calibration dataset will be used. - ctx : Context - Defines the device that users want to run forward propagation on the calibration - dataset for collecting layer output statistics. Currently, only supports single context. - LayerOutputCollector : class - For customize calibration method usage. - logger : Object - A logging object for printing information during the process of quantization. - - Returns - ------- - network : Gluon SymbolBlock - Defines the structure of a neural network for INT8 data types. - ------- - """ - - if logger: - logger.info('Export HybridBlock') - network.hybridize() - import mxnet as mx - if calib_data is not None: - if isinstance(calib_data, DataIter): - dshapes = calib_data.provide_data - else: - calib_data, dshapes = _as_data_iter(calib_data) - if not data_shapes: - data_shapes = dshapes - if not data_shapes: - raise ValueError('data_shapes required') - data_nd = [] - for shape in data_shapes: - data_nd.append(mx.nd.zeros(shape.shape)) - while True: - try: - network(*data_nd) - except TypeError: - del data_nd[-1] - del calib_data.provide_data[-1] - continue - else: - break - - import tempfile - try: - from tempfile import TemporaryDirectory - except ImportError: - # really simple implementation of TemporaryDirectory - class TemporaryDirectory(object): - def __init__(self, suffix='', prefix='', dir=''): - self._dirname = tempfile.mkdtemp(suffix, prefix, dir) - - def __enter__(self): - return self._dirname - - def __exit__(self, exc_type, exc_value, traceback): - shutil.rmtree(self._dirname) - # TODO(xinyu-intel): tmp solution to save and reload for mxnet.mod.Module. - # will enhance `export` function to return `sym, args, auxs` directly. 
- with TemporaryDirectory() as tmpdirname: - prefix = os.path.join(tmpdirname, 'tmp') - network.export(prefix, epoch=0) - symnet, args, auxs = mx.model.load_checkpoint(prefix, 0) - - if exclude_layers is None: - exclude_layers = [] - if exclude_layers_match is None: - exclude_layers_match = [] - if exclude_operators is None: - exclude_operators = [] - for name_match in exclude_layers_match: - for layers in list(symnet.get_internals()): - if layers.name.find(name_match) != -1: - exclude_layers.append(layers.name) - if logger: - logger.info('These layers have been excluded %s' % exclude_layers) - - if ctx == mx.cpu(): - symnet = symnet.get_backend_symbol('MKLDNN_QUANTIZE') - - qsym, qarg_params, aux_params, collector = quantize_graph( - sym=symnet, arg_params=args, aux_params=auxs, ctx=ctx, - excluded_sym_names=exclude_layers, excluded_op_names=exclude_operators, - calib_mode=calib_mode, quantized_dtype=quantized_dtype, quantize_mode=quantize_mode, - quantize_granularity=quantize_granularity, LayerOutputCollector=LayerOutputCollector, - logger=logger) - - if calib_mode is not None and calib_mode != 'none': - if not isinstance(ctx, Context): - raise ValueError( - 'currently only supports single ctx, while received %s' % str(ctx)) - if calib_data is None: - raise ValueError( - 'calib_data must be provided when calib_mode=%s' % calib_mode) - if calib_mode in ['naive', 'entropy', 'customize']: - data_names = [pair[0] for pair in calib_data.provide_data] - mod = Module(symbol=symnet, context=ctx, - data_names=data_names, label_names=None) - mod.bind(for_training=False, data_shapes=data_shapes) - mod.set_params(args, auxs, allow_missing=False, force_init=True) - num_examples = _collect_layer_statistics(mod, calib_data, collector, - num_calib_examples, logger) - if logger: - logger.info('Collected layer output values from FP32 model using %d examples' - % num_examples) - qsym, qarg_params, aux_params = calib_graph( - qsym=qsym, arg_params=args, aux_params=auxs, collector=collector, - calib_mode=calib_mode, quantized_dtype=quantized_dtype, logger=logger) - else: - raise ValueError( - 'please set calibration mode to naive or entropy.') - elif calib_mode is not None and calib_mode == 'none': - data_names = [pair[0] for pair in data_shapes] - - if ctx == mx.cpu(): - qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE') - - from ..gluon import SymbolBlock - data_sym = [] - for name in data_names: - data_sym.append(mx.sym.var(name)) - net = SymbolBlock(qsym, data_sym) - # TODO(xinyu-intel): tmp solution to save param_dict and reload for SymbolBlock - # will enhance SymbolBlock to load args, auxs directly. - with TemporaryDirectory() as tmpdirname: - prefix = os.path.join(tmpdirname, 'tmp') - param_name = '%s-%04d.params' % (prefix + 'net-quantized', 0) - save_dict = {('arg:%s' % k): v.as_in_context(cpu()) - for k, v in qarg_params.items()} - save_dict.update({('aux:%s' % k): v.as_in_context(cpu()) - for k, v in aux_params.items()}) - nd_save(param_name, save_dict) - net.collect_params().load(param_name, cast_dtype=True, dtype_source='saved') - net.collect_params().reset_ctx(ctx) - return net - -def quantize_net(network, quantized_dtype='auto', quantize_mode='full', - exclude_layers=None, exclude_layers_match=None, exclude_operators=None, - calib_data=None, data_shapes=None, calib_mode='none', - num_calib_examples=None, ctx=cpu(), logger=None): - """User-level API for Gluon users to generate a quantized SymbolBlock from a FP32 HybridBlock w/ or w/o calibration. 
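`quantize_net_v2` above was the Gluon front end around `quantize_graph` and `calib_graph`, both of which stay in this file (their signatures appear as unchanged context in the surrounding hunks). A rough sketch of calling the retained `quantize_graph` directly on an exported FP32 checkpoint, skipping calibration; the model prefix is hypothetical, and per the deleted code CPU users would additionally pass the symbol through `get_backend_symbol('MKLDNN_QUANTIZE')`:

import mxnet as mx
from mxnet.contrib.quantization import quantize_graph

sym, arg_params, aux_params = mx.model.load_checkpoint('model/resnet', 0)

# calib_mode='none': requantization thresholds are computed at runtime via min/max
qsym, qarg_params, aux_params, _ = quantize_graph(
    sym=sym, arg_params=arg_params, aux_params=aux_params,
    ctx=mx.cpu(), calib_mode='none', quantized_dtype='int8')

mx.model.save_checkpoint('model/resnet-quantized', 0, qsym, qarg_params, aux_params)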
- Will be deprecated after MXNet 2.0, please use quantize_net_v2. - """ - warnings.warn('WARNING: This will be deprecated after MXNet 2.0, please use quantize_net_v2.') - return quantize_net_v2(network=network, quantized_dtype=quantized_dtype, - quantize_mode=quantize_mode, - quantize_granularity='tensor-wise', - exclude_layers=exclude_layers, - exclude_layers_match=exclude_layers_match, - exclude_operators=exclude_operators, - calib_data=calib_data, data_shapes=data_shapes, - calib_mode=calib_mode, num_calib_examples=num_calib_examples, - ctx=ctx, LayerOutputCollector=None, logger=logger) diff --git a/python/mxnet/contrib/svrg_optimization/__init__.py b/python/mxnet/contrib/svrg_optimization/__init__.py deleted file mode 100644 index 6e70009983c9..000000000000 --- a/python/mxnet/contrib/svrg_optimization/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""SVRGModule, SVRGOptimization import. -""" - - -from . import svrg_module -from . import svrg_optimizer diff --git a/python/mxnet/contrib/svrg_optimization/svrg_module.py b/python/mxnet/contrib/svrg_optimization/svrg_module.py deleted file mode 100644 index fc5a6c224809..000000000000 --- a/python/mxnet/contrib/svrg_optimization/svrg_module.py +++ /dev/null @@ -1,579 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -"""A `SVRGModule` implements the `Module` API by wrapping an auxiliary module to perform -SVRG optimization logic. -""" - -import time -import logging -import mxnet as mx -from mxnet.module import Module -from .svrg_optimizer import _SVRGOptimizer - - -class SVRGModule(Module): - """SVRGModule is a module that encapsulates two Modules to accommodate the SVRG optimization technique. - It is functionally the same as Module API, except it is implemented using SVRG optimization logic. - - Parameters - ---------- - symbol : Symbol - data_names : list of str - Defaults to `('data')` for a typical model used in image classification. 
- label_names : list of str - Defaults to `('softmax_label')` for a typical model used in image classification. - logger : Logger - Defaults to `logging`. - context : Context or list of Context - Defaults to ``mx.cpu()``. - work_load_list : list of number - Default ``None``, indicating uniform workload. - fixed_param_names: list of str - Default ``None``, indicating no network parameters are fixed. - state_names : list of str - states are similar to data and label, but not provided by data iterator. \ - Instead they are initialized to 0 and can be set by `set_states()`. - group2ctxs : dict of str to context or list of context, or list of dict of str to context - Default is `None`. Mapping the `ctx_group` attribute to the context assignment. - compression_params : dict - Specifies type of gradient compression and additional arguments depending \ - on the type of compression being used. For example, 2bit compression requires a threshold. \ - Arguments would then be {'type':'2bit', 'threshold':0.5} \ - See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. \ - update_freq: int - Specifies the number of times to update the full gradients to be used in the SVRG optimization. For instance, \ - update_freq = 2 will calculates the gradients over all data every two epochs - - Examples - -------- - >>> # An example of declaring and using SVRGModule. - >>> mod = SVRGModule(symbol=lro, data_names=['data'], label_names=['lin_reg_label'], update_freq=2) - >>> mod.fit(di, eval_metric='mse', optimizer='sgd', optimizer_params=(('learning_rate', 0.025),), - >>> num_epoch=num_epoch, kvstore='local') - """ - - def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), - logger=logging, context=mx.cpu(), work_load_list=None, - fixed_param_names=None, state_names=None, group2ctxs=None, - compression_params=None, update_freq=None): - super(SVRGModule, self).__init__(symbol, data_names=data_names, label_names=label_names, logger=logger, - context=context, work_load_list=work_load_list, - fixed_param_names=fixed_param_names, state_names=state_names, - group2ctxs=group2ctxs, compression_params=compression_params) - - # Type check update_frequency - if isinstance(update_freq, int): - if update_freq <= 0: - raise ValueError("update_freq in SVRGModule must be a positive integer to represent the frequency for " - "calculating full gradients") - self.update_freq = update_freq - else: - raise TypeError("update_freq in SVRGModule must be an integer to represent the frequency for " - "calculating full gradients") - - self._mod_aux = mx.mod.Module(symbol, data_names, label_names, logger, context, work_load_list, - fixed_param_names, state_names, group2ctxs, compression_params) - - self._param_dict = None - self._ctx_len = len(self._context) - - def _reset_bind(self): - """Internal function to reset binded state for both modules.""" - super(SVRGModule, self)._reset_bind() - self._mod_aux._reset_bind() - - def reshape(self, data_shapes, label_shapes=None): - """Reshapes both modules for new input shapes. - - Parameters - ---------- - data_shapes : list of (str, tuple) - Typically is ``data_iter.provide_data``. - label_shapes : list of (str, tuple) - Typically is ``data_iter.provide_label``. 
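# A minimal sketch of the `di` and `lro` names assumed by the SVRGModule example
# above (hypothetical shapes): a linear-regression symbol plus a matching iterator.
import mxnet as mx

train_x = mx.nd.random.uniform(shape=(100, 5))
train_y = mx.nd.random.uniform(shape=(100, 1))
di = mx.io.NDArrayIter(train_x, train_y, batch_size=10, label_name='lin_reg_label')

data = mx.sym.Variable('data')
label = mx.sym.Variable('lin_reg_label')
fc = mx.sym.FullyConnected(data, num_hidden=1, name='fc')
lro = mx.sym.LinearRegressionOutput(fc, label=label, name='lro')
num_epoch = 10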
- """ - super(SVRGModule, self).reshape(data_shapes, label_shapes=label_shapes) - self._mod_aux.reshape(data_shapes, label_shapes=label_shapes) - - def init_optimizer(self, kvstore='local', optimizer='sgd', - optimizer_params=(('learning_rate', 0.01),), force_init=False): - """Installs and initializes SVRGOptimizer. The SVRGOptimizer is a wrapper class for a regular optimizer that is - passed in and a special AssignmentOptimizer to accumulate the full gradients. If KVStore is 'local' or None, - the full gradients will be accumulated locally without pushing to the KVStore. Otherwise, additional keys will - be pushed to accumulate the full gradients in the KVStore. - - Parameters - ---------- - kvstore : str or KVStore - Default `'local'`. - optimizer : str or Optimizer - Default `'sgd'` - optimizer_params : dict - Default `(('learning_rate', 0.01),)`. The default value is not a dictionary, - just to avoid pylint warning of dangerous default values. - force_init : bool - Default ``False``, indicating whether we should force re-initializing the - optimizer in the case an optimizer is already installed. - """ - - # Init dict for storing average of full gradients for each device - self._param_dict = [{key: mx.nd.zeros(shape=value.shape, ctx=self._context[i]) - for key, value in self.get_params()[0].items()} for i in range(self._ctx_len)] - - svrg_optimizer = self._create_optimizer(_SVRGOptimizer.__name__, default_opt=optimizer, - kvstore=kvstore, optimizer_params=optimizer_params) - - super(SVRGModule, self).init_optimizer(kvstore=kvstore, optimizer=svrg_optimizer, - optimizer_params=optimizer_params, force_init=force_init) - - # Init additional keys for accumulating full grads in KVStore - if self._kvstore: - for idx, param_on_devs in enumerate(self._exec_group.param_arrays): - name = self._exec_group.param_names[idx] - self._kvstore.init(name + "_full", mx.nd.zeros(shape=self._arg_params[name].shape)) - if self._update_on_kvstore: - self._kvstore.pull(name + "_full", param_on_devs, priority=-idx) - - def _create_optimizer(self, optimizer, default_opt, kvstore, optimizer_params): - """Helper function to create a svrg optimizer. SVRG optimizer encapsulates two optimizers and - will redirect update() to the correct optimizer based on the key. - - Parameters - ---------- - kvstore : str or KVStore - Default `'local'`. - optimizer: str - Name for SVRGOptimizer - default_opt : str or Optimizer that was passed in. - optimizer_params : dict - optimizer params that was passed in. 
- """ - - # code partially copied from mxnet module.init_optimizer() to accomodate svrg_optimizer - batch_size = self._exec_group.batch_size - - (kv_store, update_on_kvstore) = mx.model._create_kvstore(kvstore, self._ctx_len, self._arg_params) - if kv_store and 'dist' in kv_store.type and '_sync' in kv_store.type: - batch_size *= kv_store.num_workers - rescale_grad = 1.0 / batch_size - - idx2name = {} - if update_on_kvstore: - idx2name.update(enumerate(self._exec_group.param_names)) - else: - for k in range(self._ctx_len): - idx2name.update({i * self._ctx_len + k: n - for i, n in enumerate(self._exec_group.param_names)}) - - # update idx2name to include new keys - for key in self._param_dict[0].keys(): - max_key = max(list(idx2name.keys())) + 1 - idx2name[max_key] = key + "_full" - - optimizer_params = dict(optimizer_params) - if 'rescale_grad' not in optimizer_params: - optimizer_params['rescale_grad'] = rescale_grad - optimizer_params["default_optimizer"] = default_opt - optimizer_params["param_idx2name"] = idx2name - optimizer = mx.optimizer.create(optimizer, **optimizer_params) - - return optimizer - - def bind(self, data_shapes, label_shapes=None, for_training=True, - inputs_need_grad=False, force_rebind=False, shared_module=None, grad_req='write'): - """Binds the symbols to construct executors for both two modules. This is necessary before one - can perform computation with the SVRGModule. - - Parameters - ---------- - data_shapes : list of (str, tuple) - Typically is ``data_iter.provide_data``. - label_shapes : list of (str, tuple) - Typically is ``data_iter.provide_label``. - for_training : bool - Default is ``True``. Whether the executors should be bound for training. - inputs_need_grad : bool - Default is ``False``. Whether the gradients to the input data need to be computed. - Typically this is not needed. But this might be needed when implementing composition - of modules. - force_rebind : bool - Default is ``False``. This function does nothing if the executors are already - bound. But with this ``True``, the executors will be forced to rebind. - shared_module : Module - Default is ``None``. This is used in bucketing. When not ``None``, the shared module - essentially corresponds to a different bucket -- a module with different symbol - but with the same sets of parameters (e.g. unrolled RNNs with different lengths). - """ - # force rebinding is typically used when one want to switch from - # training to prediction phase. - super(SVRGModule, self).bind(data_shapes, label_shapes, for_training, inputs_need_grad, force_rebind, - shared_module, grad_req) - - if for_training: - self._mod_aux.bind(data_shapes, label_shapes, for_training, inputs_need_grad, force_rebind, shared_module, - grad_req) - - def forward(self, data_batch, is_train=None): - """Forward computation for both two modules. It supports data batches with different shapes, such as - different batch sizes or different image sizes. - If reshaping of data batch relates to modification of symbol or module, such as - changing image layout ordering or switching from training to predicting, module - rebinding is required. - - See Also - ---------- - :meth:`BaseModule.forward`. - - Parameters - ---------- - data_batch : DataBatch - Could be anything with similar API implemented. - is_train : bool - Default is ``None``, which means ``is_train`` takes the value of ``self.for_training``. 
- """ - super(SVRGModule, self).forward(data_batch, is_train) - - if is_train: - self._mod_aux.forward(data_batch, is_train) - - def backward(self, out_grads=None): - """Backward computation. - - See Also - ---------- - :meth:`BaseModule.backward`. - - Parameters - ---------- - out_grads : NDArray or list of NDArray, optional - Gradient on the outputs to be propagated back. - This parameter is only needed when bind is called - on outputs that are not a loss function. - """ - super(SVRGModule, self).backward(out_grads) - - if self._mod_aux.binded: - self._mod_aux.backward(out_grads) - - def update(self): - """Updates parameters according to the installed optimizer and the gradients computed - in the previous forward-backward batch. The gradients in the _exec_group will be overwritten - using the gradients calculated by the SVRG update rule. - - When KVStore is used to update parameters for multi-device or multi-machine training, - a copy of the parameters is stored in KVStore. Note that for `row_sparse` parameters, - this function does update the copy of parameters in KVStore, but doesn't broadcast the - updated parameters to all devices / machines. Please call `prepare` to broadcast - `row_sparse` parameters with the next batch of data. - - See Also - ---------- - :meth:`BaseModule.update`. - """ - self._update_svrg_gradients() - super(SVRGModule, self).update() - - def update_full_grads(self, train_data): - """Computes the gradients over all data w.r.t weights of past - m epochs. For distributed env, it will accumulate full grads in the kvstore. - - Parameters - ---------- - train_data: DataIter - Train data iterator - """ - param_names = self._exec_group.param_names - arg, aux = self.get_params() - self._mod_aux.set_params(arg_params=arg, aux_params=aux) - train_data.reset() - nbatch = 0 - padding = 0 - for batch in train_data: - self._mod_aux.forward(batch, is_train=True) - self._mod_aux.backward() - nbatch += 1 - for ctx in range(self._ctx_len): - for index, name in enumerate(param_names): - grads = self._mod_aux._exec_group.grad_arrays[index][ctx] - self._param_dict[ctx][name] = mx.nd.broadcast_add(self._param_dict[ctx][name], grads, axis=0) - padding = batch.pad - - true_num_batch = nbatch - padding / train_data.batch_size - for name in param_names: - grad_list = [] - for i in range(self._ctx_len): - self._param_dict[i][name] /= true_num_batch - grad_list.append(self._param_dict[i][name]) - if self._kvstore: - # If in distributed mode, push a list of gradients from each worker/device to the KVStore - self._accumulate_kvstore(name, grad_list) - - def _accumulate_kvstore(self, key, value): - """Accumulate gradients over all data in the KVStore. In distributed setting, each worker sees a portion of - data. The full gradients will be aggregated from each worker in the KVStore. - - Parameters - ---------- - - key: int or str - Key in the KVStore. - value: NDArray, RowSparseNDArray - Average of the full gradients. - """ - # Accumulate full gradients for current epochs - self._kvstore.push(key + "_full", value) - self._kvstore._barrier() - self._kvstore.pull(key + "_full", value) - - self._allocate_gradients(key, value) - - def _allocate_gradients(self, key, value): - """Allocate average of full gradients accumulated in the KVStore to each device. - - Parameters - ---------- - - key: int or str - Key in the kvstore. - value: List of NDArray, List of RowSparseNDArray - A list of average of the full gradients in the KVStore. 
- """ - for i in range(self._ctx_len): - self._param_dict[i][key] = value[i] / self._ctx_len - - def _svrg_grads_update_rule(self, g_curr_batch_curr_weight, g_curr_batch_special_weight, - g_special_weight_all_batch): - """Calculates the gradient based on the SVRG update rule. - Parameters - ---------- - g_curr_batch_curr_weight : NDArray - gradients of current weight of self.mod w.r.t current batch of data - g_curr_batch_special_weight: NDArray - gradients of the weight of past m epochs of self._mod_special w.r.t current batch of data - g_special_weight_all_batch: NDArray - average of full gradients over full pass of data - - Returns - ---------- - Gradients calculated using SVRG update rule: - grads = g_curr_batch_curr_weight - g_curr_batch_special_weight + g_special_weight_all_batch - """ - for index, grad in enumerate(g_curr_batch_curr_weight): - grad -= g_curr_batch_special_weight[index] - grad += g_special_weight_all_batch[index] - return g_curr_batch_curr_weight - - def _update_svrg_gradients(self): - """Calculates gradients based on the SVRG update rule. - """ - param_names = self._exec_group.param_names - for ctx in range(self._ctx_len): - for index, name in enumerate(param_names): - g_curr_batch_reg = self._exec_group.grad_arrays[index][ctx] - g_curr_batch_special = self._mod_aux._exec_group.grad_arrays[index][ctx] - g_special_weight_all_batch = self._param_dict[ctx][name] - g_svrg = self._svrg_grads_update_rule(g_curr_batch_reg, g_curr_batch_special, - g_special_weight_all_batch) - self._exec_group.grad_arrays[index][ctx] = g_svrg - - def fit(self, train_data, eval_data=None, eval_metric='acc', - epoch_end_callback=None, batch_end_callback=None, kvstore='local', - optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), - eval_end_callback=None, - eval_batch_end_callback=None, initializer=mx.init.Uniform(0.01), - arg_params=None, aux_params=None, allow_missing=False, - force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None, - validation_metric=None, monitor=None, sparse_row_id_fn=None): - """Trains the module parameters. - - Parameters - ---------- - train_data : DataIter - Train DataIter. - eval_data : DataIter - If not ``None``, will be used as validation set and the performance - after each epoch will be evaluated. - eval_metric : str or EvalMetric - Defaults to 'accuracy'. The performance measure used to display during training. - Other possible predefined metrics are: - 'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'. - epoch_end_callback : function or list of functions - Each callback will be called with the current `epoch`, `symbol`, `arg_params` - and `aux_params`. - batch_end_callback : function or list of function - Each callback will be called with a `BatchEndParam`. - kvstore : str or KVStore - Defaults to 'local'. - optimizer : str or Optimizer - Defaults to 'sgd'. - optimizer_params : dict - Defaults to ``(('learning_rate', 0.01),)``. The parameters for - the optimizer constructor. - The default value is not a dict, just to avoid pylint warning on dangerous - default values. - eval_end_callback : function or list of function - These will be called at the end of each full evaluation, with the metrics over - the entire evaluation set. - eval_batch_end_callback : function or list of function - These will be called at the end of each mini-batch during evaluation. - initializer : Initializer - The initializer is called to initialize the module parameters when they are - not already initialized. 
- arg_params : dict - Defaults to ``None``, if not ``None``, should be existing parameters from a trained - model or loaded from a checkpoint (previously saved model). In this case, - the value here will be used to initialize the module parameters, unless they - are already initialized by the user via a call to `init_params` or `fit`. - `arg_params` has a higher priority than `initializer`. - aux_params : dict - Defaults to ``None``. Similar to `arg_params`, except for auxiliary states. - allow_missing : bool - Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params` - and `aux_params` are not ``None``. If this is ``True``, then the missing parameters - will be initialized via the `initializer`. - force_rebind : bool - Defaults to ``False``. Whether to force rebinding the executors if already bound. - force_init : bool - Defaults to ``False``. Indicates whether to force initialization even if the - parameters are already initialized. - begin_epoch : int - Defaults to 0. Indicates the starting epoch. Usually, if resumed from a - checkpoint saved at a previous training phase at epoch N, then this value should be - N+1. - num_epoch : int - Number of epochs for training. - sparse_row_id_fn : A callback function - The function takes `data_batch` as an input and returns a dict of - str -> NDArray. The resulting dict is used for pulling row_sparse - parameters from the kvstore, where the str key is the name of the param, - and the value is the row id of the param to pull. - validation_metric: str or EvalMetric - The performance measure used to display during validation. - """ - assert num_epoch is not None, 'please specify number of epochs' - - self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label, - for_training=True, force_rebind=force_rebind) - if monitor is not None: - self.install_monitor(monitor) - self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params, - allow_missing=allow_missing, force_init=force_init) - self.init_optimizer(kvstore=kvstore, optimizer=optimizer, optimizer_params=optimizer_params) - - if validation_metric is None: - validation_metric = eval_metric - if not isinstance(eval_metric, mx.gluon.metric.EvalMetric): - eval_metric = mx.gluon.metric.create(eval_metric) - - ################################################################################ - # training loop - ################################################################################ - for epoch in range(begin_epoch, num_epoch): - eval_metric.reset() - tic = time.time() - if epoch % self.update_freq == 0: - self.update_full_grads(train_data) - - train_data.reset() - data_iter = iter(train_data) - end_of_batch = False - nbatch = 0 - next_data_batch = next(data_iter) - - while not end_of_batch: - data_batch = next_data_batch - if monitor is not None: - monitor.tic() - - self.forward_backward(data_batch) - self.update() - - if isinstance(data_batch, list): - self.update_metric(eval_metric, [db.label for db in data_batch], pre_sliced=True) - else: - self.update_metric(eval_metric, data_batch.label) - - try: - # pre fetch next batch - next_data_batch = next(data_iter) - self.prepare(next_data_batch, sparse_row_id_fn=sparse_row_id_fn) - except StopIteration: - end_of_batch = True - - if monitor is not None: - monitor.toc_print() - - if end_of_batch: - eval_name_vals = eval_metric.get_name_value() - - if batch_end_callback is not None: - batch_end_params = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch, - 
eval_metric=eval_metric, locals=locals()) - for callback in mx.base._as_list(batch_end_callback): - callback(batch_end_params) - - nbatch += 1 - for name, val in eval_name_vals: - self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) - toc = time.time() - self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc - tic)) - - # sync aux params across devices - arg_params, aux_params = self.get_params() - self.set_params(arg_params, aux_params) - - if epoch_end_callback is not None: - for callback in mx.base._as_list(epoch_end_callback): - callback(epoch, self.symbol, arg_params, aux_params) - - # ---------------------------------------- - # evaluation on validation set - if eval_data: - res = self.score(eval_data, validation_metric, - score_end_callback=eval_end_callback, - batch_end_callback=eval_batch_end_callback, epoch=epoch) - for name, val in res: - self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val) - - def prepare(self, data_batch, sparse_row_id_fn=None): - """Prepares two modules for processing a data batch. - - Usually involves switching bucket and reshaping. - For modules that contain `row_sparse` parameters in KVStore, - it prepares the `row_sparse` parameters based on the sparse_row_id_fn. - - When KVStore is used to update parameters for multi-device or multi-machine training, - a copy of the parameters are stored in KVStore. Note that for `row_sparse` parameters, - the `update()` updates the copy of parameters in KVStore, but doesn't broadcast - the updated parameters to all devices / machines. The `prepare` function is used to - broadcast `row_sparse` parameters with the next batch of data. - - Parameters - ---------- - data_batch : DataBatch - The current batch of data for forward computation. - - sparse_row_id_fn : A callback function - The function takes `data_batch` as an input and returns a dict of - str -> NDArray. The resulting dict is used for pulling row_sparse - parameters from the kvstore, where the str key is the name of the param, - and the value is the row id of the param to pull. - """ - super(SVRGModule, self).prepare(data_batch, sparse_row_id_fn=sparse_row_id_fn) - self._mod_aux.prepare(data_batch, sparse_row_id_fn=sparse_row_id_fn) diff --git a/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py b/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py deleted file mode 100644 index fba99a0434d7..000000000000 --- a/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py +++ /dev/null @@ -1,174 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=W0223 -"""A `_SVRGOptimizer` encapsulates two optimizers to support SVRGModule in single machine and distributed settings. -Both `_AssignmentOptimizer` and `_SVRGOptimizer` are designed to be used with SVRGModule only. 
-""" - - -import mxnet as mx - - -@mx.optimizer.register -class _AssignmentOptimizer(mx.optimizer.Optimizer): - """_AssignmentOptimizer assigns gradients to weights for SVRGModule's full gradients - accumulation in the KVStore. It is a helper optimizer that is designed to be used with SVRGModule only. - """ - def update(self, indices, weights, grads, states): - """Assign the gradients to weight for accumulating full gradients in the KVStore across all devices and workers. - - Parameters - ---------- - indices : list of int - List of unique indices of the parameters into the individual learning rates - and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` - and `set_wd_mult()`, respectively. - weights : list of NDArray - List of parameters to be updated. - grads : list of NDArray - List of gradients of the objective with respect to this parameter. - states : List of any obj - AssignmentOptimizer will not need to be associated with state. - """ - for weight, grad in zip(weights, grads): - weight[:] = grad - - -@mx.optimizer.register -class _SVRGOptimizer(mx.optimizer.Optimizer): - """_SVRGOptimizer is a wrapper class for two optimizers: _AssignmentOptimizer for accumulating full gradients in the - KVStore and a default optimizer that is passed in as a parameter in `mod.init_optimizer()` - The _SVRGOptimizer is designed to be used with SVRGModule only. - - This optimizer accepts the following parameters in addition to those accepted by :class:`.Optimizer`. - - Parameters - ---------- - default_optimizer: str or Optimizer - Optimizer passed-in when invoke on mx.mod.init_optimizer in SVRGModule - """ - - def __init__(self, default_optimizer, **kwargs): - # Reconstruct kwargs to identify additional params for default optimizer - base_param = self._check_params(**kwargs) - super(_SVRGOptimizer, self).__init__(**base_param) - if isinstance(default_optimizer, str): - self.default_opt = mx.optimizer.create(default_optimizer, **kwargs) - else: - self.default_opt = default_optimizer - self.aux_opt = mx.optimizer.create(_AssignmentOptimizer.__name__) - - @staticmethod - def _check_params(**kwargs): - """ Reassemble kwargs to identify additional optimizer params for default optimizers. base_params contains - all the param names in base class Optimizer. - - Parameters - ---------- - kwargs: dict - Parameters for the default optimizer - - Returns - ---------- - default_params: dict - Optimizer parameters that are defined in base class Optimizer - """ - - optimizer_param = dict(kwargs) - base_params = ['rescale_grad', 'param_idx2name', 'wd', 'clip_gradient', 'learning_rate', 'lr_scheduler', 'sym', - 'begin_num_update', 'multi_precision', 'param_dict'] - - default_params = {} - for key, _ in optimizer_param.items(): - if key in base_params: - default_params[key] = optimizer_param[key] - - return default_params - - def update(self, indices, weights, grads, states): - """Updates the given parameter using the corresponding gradient and state. If key contains 'full', update with - `_AssignmentOptimizer` otherwise will use default optimizer. - - Parameters - ---------- - indices : list of int - List of unique indices of the parameters into the individual learning rates - and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` - and `set_wd_mult()`, respectively. - weights : list of NDArray - List of parameters to be updated. - grads : list of NDArray - List of gradients of the objective with respect to this parameter. 
- states : List of any obj - List of state returned by `create_state()`. - """ - - for index, weight, grad, state in zip(indices, weights, grads, states): - name = self._check_index(index) - - if "full" in name: - self.aux_opt.update([index], [weight], [grad], [state]) - else: - # use the default optimizer - self.default_opt.update([index], [weight], [grad], [state]) - - def create_state(self, index, weight): - """Creates auxiliary state for a given weight. - Some optimizers require additional states, e.g. as momentum, in addition - to gradients in order to update weights. This function creates state - for a given weight which will be used in `update`. This function is - called only once for each weight. - - Parameters - ---------- - index : int - An unique index to identify the weight. - weight : NDArray - The weight. - Returns - ------- - state : any obj - The state associated with the weight. - """ - - name = self._check_index(index) - if "full" in name: - return self.aux_opt.create_state(index, weight) - else: - # - return self.default_opt.create_state(index, weight) - - def _check_index(self, index): - """Check index in idx2name to get corresponding param_name - Parameters - ---------- - index : int or str - An unique index to identify the weight. - Returns - ------- - name : str - Name of the Module parameter - """ - - if index in self.idx2name.values(): - # index is a str - name = index - else: - # index is an int - name = self.idx2name[index] - return name diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index 03fa812f3200..25b5d85027bc 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -30,11 +30,6 @@ from .ndarray import NDArray from .ndarray import _ndarray_cls -# those functions are not used here, we just import them to keep backward compatibility -# in case the end user calls them, as they originally lives here -# pylint: disable=unused-import -from .executor_manager import _split_input_slice, _check_arguments, _load_data, _load_label - def _monitor_callback_wrapper(callback): """A wrapper for the user-defined handle.""" def callback_handle(name, array, _): diff --git a/python/mxnet/executor_manager.py b/python/mxnet/executor_manager.py deleted file mode 100644 index d0b7050b8ad0..000000000000 --- a/python/mxnet/executor_manager.py +++ /dev/null @@ -1,443 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# coding: utf-8 -# pylint: disable=invalid-name, protected-access, too-many-locals, too-many-arguments, too-many-statements -"""Executor manager.""" - -import logging -import numpy as np - -from .base import mx_real_t -from . import ndarray as nd -from .context import cpu -from .io import DataDesc - -def _split_input_slice(batch_size, work_load_list): - """Get input slice from the input shape. 
- - Parameters - ---------- - batch_size : int - The number of samples in a mini-batch. - work_load_list : list of float or int, optional - The list of work load for different devices, - in the same order as `ctx`. - - Returns - ------- - slices : list of slice - The split slices to get a specific slice. - - Raises - ------ - ValueError - In case of too many splits, leading to some empty slices. - """ - total_work_load = sum(work_load_list) - batch_num_list = [round(work_load * batch_size / total_work_load) - for work_load in work_load_list] - batch_num_sum = sum(batch_num_list) - if batch_num_sum < batch_size: - batch_num_list[-1] += batch_size - batch_num_sum - slices = [] - end = 0 - for batch_num in batch_num_list: - begin = int(min((end, batch_size))) - end = int(min((begin + batch_num, batch_size))) - if begin >= end: - raise ValueError('Too many slices. Some splits are empty.') - slices.append(slice(begin, end)) - return slices - -def _check_arguments(symbol): - """Check the argument names of symbol. - This function checks the duplication of arguments in Symbol. - The check is done for feedforward net for now. - - Parameters - ---------- - symbol : Symbol - The network configuration. - """ - arg_set = set() - arg_names = symbol.list_arguments() - for name in arg_names: - if name in arg_set: - raise ValueError(('Find duplicated argument name \"%s\", ' + - 'please make the weight name non-duplicated(using name arguments), ' + - 'arguments are %s') % (name, str(arg_names))) - arg_set.add(name) - - aux_set = set() - aux_names = symbol.list_auxiliary_states() - for name in aux_names: - if name in aux_set: - raise ValueError( - ('Find duplicated auxiliary param name \"%s\", ' + - 'please make the weight name non-duplicated(using name arguments), ' + - 'arguments are %s, auxiliary params are %s' - ) % (name, str(arg_names), str(aux_names))) - aux_set.add(name) - -def _load_general(data, targets): - """Load a list of arrays into a list of arrays specified by slices.""" - for d_src, d_targets in zip(data, targets): - if isinstance(d_targets, nd.NDArray): - d_src.copyto(d_targets) - else: - assert d_targets[-1][0].stop == d_src.shape[0], \ - "Batch size miss match. 
Expected %d, got %d"%( \ - d_targets[-1][0].stop, d_src.shape[0]) - for slice_idx, d_dst in d_targets: - d_src[slice_idx].copyto(d_dst) - -def _load_data(batch, targets): - """Load data into sliced arrays.""" - _load_general(batch.data, targets) - -def _load_label(batch, targets): - """Load label into sliced arrays.""" - _load_general(batch.label, targets) - -# pylint: disable=too-many-branches -def _bind_exec(sym, ctx, input_shapes, param_names, need_grad=False, - base_exec=None, shared_data_arrays=None, input_types=None, logger=logging): - """bind executor for bucketing, potentially sharing data with an existing executor.""" - arg_shape, _, aux_shape = sym.infer_shape(**input_shapes) - assert(arg_shape is not None) - if input_types is None: - input_types = {k: mx_real_t for k in input_shapes.keys()} - arg_types, _, aux_types = sym.infer_type(**input_types) - assert(arg_types is not None) - - arg_arrays = [] - grad_arrays = {} if need_grad is not False else None - - arg_names = sym.list_arguments() - - if need_grad is False: - need_grad = set() - elif need_grad is True: - need_grad = set(arg_names) - set(input_shapes.keys()) - elif isinstance(need_grad, set): - pass - else: - raise AssertionError("need_grad must be boolean or set.") - grad_req = {name:('write' if name in need_grad else 'null') for name in arg_names} - - - # create or borrow arguments and gradients - for i, name in enumerate(arg_names): - if not name in param_names: - # data or label - if shared_data_arrays is not None and \ - name in shared_data_arrays: - arg_arr = shared_data_arrays[name] - - if np.prod(arg_arr.shape) >= np.prod(arg_shape[i]): - # good, we can share this memory - assert(arg_types[i] == arg_arr.dtype) - arg_arr = arg_arr.reshape(arg_shape[i]) - else: - logger.warning(('bucketing: data "%s" has a shape %s' % (name, arg_shape[i])) + - (', which is larger than already allocated ') + - ('shape %s' % (arg_arr.shape,)) + - ('. Need to re-allocate. Consider putting ') + - ('default_bucket_key to be the bucket taking the largest ') + - ('input for better memory sharing.')) - arg_arr = nd.zeros(arg_shape[i], ctx, dtype=arg_types[i]) - - # replace existing shared array because the new one is bigger - shared_data_arrays[name] = arg_arr - else: - arg_arr = nd.zeros(arg_shape[i], ctx, dtype=arg_types[i]) - if shared_data_arrays is not None: - shared_data_arrays[name] = arg_arr - - arg_arrays.append(arg_arr) - else: - # model parameter - if base_exec is None: - arg_arr = nd.zeros(arg_shape[i], ctx, dtype=arg_types[i]) - if name in need_grad: - grad_arr = nd.zeros(arg_shape[i], ctx, dtype=arg_types[i]) - grad_arrays[name] = grad_arr - else: - arg_arr = base_exec.arg_dict[name] - assert arg_arr.shape == arg_shape[i] - assert arg_arr.dtype == arg_types[i] - if name in need_grad: - grad_arrays[name] = base_exec.grad_dict[name] - arg_arrays.append(arg_arr) - - # create or borrow aux variables - if base_exec is None: - aux_arrays = [nd.zeros(s, ctx, dtype=t) for s, t in zip(aux_shape, aux_types)] - else: - for i, a in enumerate(base_exec.aux_arrays): - assert aux_shape[i] == a.shape - assert aux_types[i] == a.dtype - - aux_arrays = [a for a in base_exec.aux_arrays] - - executor = sym.bind(ctx=ctx, args=arg_arrays, args_grad=grad_arrays, - aux_states=aux_arrays, - grad_req=grad_req, shared_exec=base_exec) - return executor - -class DataParallelExecutorGroup(object): - """A group of executors living on different devices, for data parallelization. - - Parameters - ---------- - sym: Symbol - The network configuration. 
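# Sketch of the per-device slicing computed by _split_input_slice above
# (hypothetical numbers): a batch of 10 samples split across three devices
# with work loads [1, 1, 2].
batch_size, work_load_list = 10, [1, 1, 2]
total = sum(work_load_list)
nums = [round(w * batch_size / total) for w in work_load_list]
if sum(nums) < batch_size:
    nums[-1] += batch_size - sum(nums)
slices, end = [], 0
for n in nums:
    begin, end = end, min(end + n, batch_size)
    slices.append(slice(begin, end))
# slices == [slice(0, 2), slice(2, 4), slice(4, 10)]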
- arg_names: list of str - Equals `sym.list_arguments()` - param_names: list of str - List of names of all trainable parameters. - ctx: list of Context - List of devices for training (data parallelization). - slices: list of int - Describes how the data parallelization splits data into different devices. - train_data: DataIter (or DataBatch) - The dataset for training. It could be any object with `provide_data` and - `provide_label` properties. Loading of actual data is not necessarily needed - at this stage. - shared_grop: DataParallelExecutorGroup - An existing executor group, if to share parameters with it. - """ - def __init__(self, sym, arg_names, param_names, ctx, slices, train_data, shared_group=None): - # make sure the architecture is valid - _check_arguments(sym) - - if shared_group is None: - self.shared_data_arrays = [{} for _ in ctx] - else: - self.shared_data_arrays = shared_group.shared_data_arrays - - self.data_names = [x[0] for x in train_data.provide_data] - self.label_names = [x[0] for x in train_data.provide_label] - self.aux_names = sym.list_auxiliary_states() - self.param_idx = [i for i in range(len(arg_names)) if arg_names[i] in param_names] - self.param_names = [arg_names[i] for i in self.param_idx] - - self.train_execs = [] - for i, ctxi in enumerate(ctx): - data_shapes = {} - data_types = {} - for x in train_data.provide_data + train_data.provide_label: - data_shapes[x[0]] = tuple([slices[i].stop - slices[i].start] + list(x[1][1:])) - if isinstance(x, DataDesc): - data_types[x.name] = x.dtype - else: - data_types[x[0]] = mx_real_t - shared_exec = None if shared_group is None else shared_group.train_execs[i] - train_exec = _bind_exec(sym, ctxi, data_shapes, self.param_names, - need_grad=True, base_exec=shared_exec, - shared_data_arrays=self.shared_data_arrays[i], - input_types=data_types) - self.train_execs.append(train_exec) - - # data structure - self.data_arrays = [[(slices[i], e.arg_dict[name]) for i, e in enumerate(self.train_execs)] - for name in self.data_names] - self.label_arrays = [[(slices[i], e.arg_dict[name]) for i, e in enumerate(self.train_execs)] - for name in self.label_names] - - self.param_arrays = [[e.arg_arrays[i] for e in self.train_execs] - for i in self.param_idx] - self.grad_arrays = [[e.grad_arrays[i] for e in self.train_execs] - for i in self.param_idx] - - self.aux_arrays = [[e.aux_arrays[i] for e in self.train_execs] - for i in range(len(self.aux_names))] - - self.slices = slices - - def load_data_batch(self, data_batch): - """Load data and labels into arrays.""" - _load_data(data_batch, self.data_arrays) - _load_label(data_batch, self.label_arrays) - - def forward(self, is_train=False): - """Perform a forward pass on each executor.""" - for texec in self.train_execs: - texec.forward(is_train=is_train) - - def backward(self): - """Perform a backward pass on each executor.""" - for texec in self.train_execs: - texec.backward() - - def update_metric(self, metric, labels, pre_sliced=False): - """Update evaluation metric with label and current outputs.""" - for current_exec, (texec, islice) in enumerate(zip(self.train_execs, self.slices)): - if not pre_sliced: - labels_slice = [label[islice] for label in labels] - else: - labels_slice = labels[current_exec] - metric.update(labels_slice, texec.outputs) - -class DataParallelExecutorManager(object): - """ Helper class to manage multiple executors for data parallelism. - - Parameters - ---------- - symbol : Symbol - Output symbol. - ctx : list of Context - Devices to run on. 
- param_names: list of str - Name of all trainable parameters of the network. - arg_names: list of str - Name of all arguments of the network. - aux_names: list of str - Name of all auxiliary states of the network. - train_data : DataIter - Training data iterator. - work_load_list : list of float or int, optional - The list of work load for different devices, - in the same order as ctx. - logger : logging logger - When not specified, default logger will be used. - sym_gen : A function that generate new Symbols depending on different - input shapes. Used only for bucketing. - """ - def __init__(self, symbol, ctx, train_data, - arg_names, param_names, aux_names, - work_load_list=None, logger=None, sym_gen=None): - if logger is None: - logger = logging - # preparation - num_device = len(ctx) - logger.info('Start training with %s', str(ctx)) - - if work_load_list is None: - work_load_list = [1] * num_device - assert isinstance(work_load_list, list) and len(work_load_list) == num_device, \ - "Invalid settings for work load. " - - slices = _split_input_slice(train_data.batch_size, work_load_list) - self.slices = slices - - self.arg_names = arg_names - self.param_names = param_names - self.aux_names = aux_names - self.ctx = ctx - - self.execgrp = DataParallelExecutorGroup(symbol, self.arg_names, self.param_names, self.ctx, - self.slices, train_data) - self.symbol = symbol - - self.sym_gen = sym_gen - self.curr_execgrp = None # this is set when data is loaded - if self.sym_gen is not None: - self.execgrp_bucket = {train_data.default_bucket_key: self.execgrp} - - - def install_monitor(self, monitor): - """Install monitor on all executors.""" - if self.sym_gen is not None: - raise NotImplementedError("Monitoring is not implemented for bucketing") - - for train_exec in self.execgrp.train_execs: - monitor.install(train_exec) - - def set_params(self, arg_params, aux_params): - """Set parameter and aux values. - - Parameters - ---------- - arg_params : list of NDArray - Source parameter arrays - aux_params : list of NDArray - Source aux arrays. - """ - - for texec in self.execgrp.train_execs: - texec.copy_params_from(arg_params, aux_params) - - def copy_to(self, arg_params, aux_params): - """ Copy data from each executor to ```arg_params`` and ``aux_params``. - - Parameters - ---------- - arg_params : list of NDArray - Target parameter arrays. - aux_params : list of NDArray - Target aux arrays. - - Notes - ----- - - This function will inplace update the NDArrays in arg_params and aux_params. 
- """ - for name, block in zip(self.param_names, self.param_arrays): - weight = sum(w.copyto(cpu()) for w in block) / len(block) - weight.astype(arg_params[name].dtype).copyto(arg_params[name]) - for name, block in zip(self.aux_names, self.aux_arrays): - weight = sum(w.copyto(cpu()) for w in block) / len(block) - weight.astype(aux_params[name].dtype).copyto(aux_params[name]) - - @property - def param_arrays(self): - """Shared parameter arrays.""" - # param arrays should be shared by all executor groups - return self.execgrp.param_arrays - @property - def grad_arrays(self): - """Shared gradient arrays.""" - # grad arrays should be shared by all executor groups - return self.execgrp.grad_arrays - - @property - def aux_arrays(self): - """Shared aux states.""" - # aux arrays are also shared by all executor groups - return self.execgrp.aux_arrays - - def load_data_batch(self, data_batch): - """Load data and labels into arrays.""" - if self.sym_gen is not None: - key = data_batch.bucket_key - if key not in self.execgrp_bucket: - # create new bucket entry - symbol = self.sym_gen(key) - execgrp = DataParallelExecutorGroup(symbol, self.arg_names, - self.param_names, self.ctx, - self.slices, data_batch, - shared_group=self.execgrp) - self.execgrp_bucket[key] = execgrp - - self.curr_execgrp = self.execgrp_bucket[key] - else: - self.curr_execgrp = self.execgrp - - self.curr_execgrp.load_data_batch(data_batch) - - def forward(self, is_train=False): - """Run forward on the current executor.""" - self.curr_execgrp.forward(is_train=is_train) - - def backward(self): - """Run backward on the current executor.""" - self.curr_execgrp.backward() - - def update_metric(self, metric, labels, pre_sliced=False): - """Update metric with the current executor.""" - self.curr_execgrp.update_metric(metric, labels, pre_sliced) diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index 6c3612c7d784..91fe814811eb 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -1247,7 +1247,7 @@ def infer_type(self, *args): def export(self, path, epoch=0, remove_amp_cast=True): """Export HybridBlock to json format that can be loaded by - `gluon.SymbolBlock.imports`, `mxnet.mod.Module` or the C++ interface. + `gluon.SymbolBlock.imports` or the C++ interface. .. note:: When there are only one input, it will have name `data`. When there Are more than one inputs, they will be named as `data0`, `data1`, etc. @@ -1441,8 +1441,8 @@ class SymbolBlock(HybridBlock): """ @staticmethod def imports(symbol_file, input_names, param_file=None, ctx=None): - """Import model previously saved by `gluon.HybridBlock.export` or - `Module.save_checkpoint` as a `gluon.SymbolBlock` for use in Gluon. + """Import model previously saved by `gluon.HybridBlock.export` + as a `gluon.SymbolBlock` for use in Gluon. Parameters ---------- diff --git a/python/mxnet/initializer.py b/python/mxnet/initializer.py index 51cb63aae424..28f8af1eeed9 100644 --- a/python/mxnet/initializer.py +++ b/python/mxnet/initializer.py @@ -306,11 +306,9 @@ def register(klass): ... def _init_bias(self, _, arr): ... arr[:] = 1 ... - >>> # Module is an instance of 'mxnet.module.Module' + >>> # block is an instance of 'mxnet.gluon.Block' ... 
- >>> module.init_params("custominit") - >>> # module.init_params("myinit") - >>> # module.init_params(CustomInit()) + >>> block.initialize(CustomInit()) """ return _register(klass) @@ -374,11 +372,11 @@ class Mixed(object): Example ------- - >>> # Given 'module', an instance of 'mxnet.module.Module', initialize biases to zero + >>> # Given 'block', an instance of 'mxnet.gluon.Block', initialize biases to zero ... # and every other parameter to random values with uniform distribution. ... >>> init = mx.initializer.Mixed(['bias', '.*'], [mx.init.Zero(), mx.init.Uniform(0.1)]) - >>> module.init_params(init) + >>> block.initialize(init) >>> >>> for dictionary in module.get_params(): ... for key in dictionary: @@ -410,10 +408,10 @@ class Zero(Initializer): Example ------- - >>> # Given 'module', an instance of 'mxnet.module.Module', initialize weights to zero. + >>> # Given 'block', an instance of 'mxnet.gluon.Block', initialize weights to zero. ... >>> init = mx.initializer.Zero() - >>> module.init_params(init) + >>> module.initialize(init) >>> for dictionary in module.get_params(): ... for key in dictionary: ... print(key) @@ -435,10 +433,10 @@ class One(Initializer): Example ------- - >>> # Given 'module', an instance of 'mxnet.module.Module', initialize weights to one. + >>> # Given 'block', an instance of 'mxnet.gluon.Block', initialize weights to one. ... >>> init = mx.initializer.One() - >>> module.init_params(init) + >>> module.initialize(init) >>> for dictionary in module.get_params(): ... for key in dictionary: ... print(key) @@ -490,11 +488,11 @@ class Uniform(Initializer): Example ------- - >>> # Given 'module', an instance of 'mxnet.module.Module', initialize weights + >>> # Given 'block', an instance of 'mxnet.gluon.Block', initialize weights >>> # to random values uniformly sampled between -0.1 and 0.1. ... >>> init = mx.init.Uniform(0.1) - >>> module.init_params(init) + >>> module.initialize(init) >>> for dictionary in module.get_params(): ... for key in dictionary: ... print(key) @@ -524,11 +522,11 @@ class Normal(Initializer): Example ------- - >>> # Given 'module', an instance of 'mxnet.module.Module', initialize weights + >>> # Given 'block', an instance of 'mxnet.gluon.Block', initialize weights >>> # to random values sampled from a normal distribution. ... >>> init = mx.init.Normal(0.5) - >>> module.init_params(init) + >>> module.initialize(init) >>> for dictionary in module.get_params(): ... for key in dictionary: ... print(key) diff --git a/python/mxnet/module/__init__.py b/python/mxnet/module/__init__.py deleted file mode 100644 index 32ecbb9c8be3..000000000000 --- a/python/mxnet/module/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""A module is like a FeedForward model. 
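# Sketch of the Gluon-style initialization used in the updated examples above
# (hypothetical block and constant value): a registered custom Initializer applied
# through Block.initialize instead of the removed module.init_params.
import mxnet as mx

@mx.init.register
class MyConstant(mx.init.Initializer):
    def _init_weight(self, name, arr):
        arr[:] = 0.5

block = mx.gluon.nn.Dense(4, in_units=3)
block.initialize(MyConstant())
print(block.weight.data())  # all entries are 0.5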
But we would like to make it -easier to compose, similar to Torch modules. -""" - -from .base_module import BaseModule -from .module import Module -from .bucketing_module import BucketingModule -from .sequential_module import SequentialModule - -from .python_module import PythonModule, PythonLossModule diff --git a/python/mxnet/module/base_module.py b/python/mxnet/module/base_module.py deleted file mode 100644 index 92fb7f188bfb..000000000000 --- a/python/mxnet/module/base_module.py +++ /dev/null @@ -1,1067 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=fixme, too-many-arguments, too-many-locals, no-else-raise -# pylint: disable=too-many-public-methods, too-many-branches, too-many-lines -"""`BaseModule` defines an API for modules.""" - -import time -import logging -import warnings -import numpy as np - -from ..gluon import metric -from .. import ndarray - -from ..context import cpu -from ..model import BatchEndParam -from ..initializer import Uniform -from ..io import DataDesc, DataIter, DataBatch -from ..base import _as_list - - -def _check_input_names(symbol, names, typename, throw): - """Check that all input names are in symbol's arguments.""" - args = symbol.list_arguments() - for name in names: - if name in args: - continue - candidates = [arg for arg in args if - not arg.endswith('_weight') and - not arg.endswith('_bias') and - not arg.endswith('_gamma') and - not arg.endswith('_beta')] - msg = "\033[91mYou created Module with Module(..., %s_names=%s) but " \ - "input with name '%s' is not found in symbol.list_arguments(). " \ - "Did you mean one of:\n\t%s\033[0m"%( - typename, str(names), name, '\n\t'.join(candidates)) - if throw: - raise ValueError(msg) - else: - warnings.warn(msg) - - -def _check_names_match(data_names, data_shapes, name, throw): - """Check that input names matches input data descriptors.""" - actual = [x[0] for x in data_shapes] - if sorted(data_names) != sorted(actual): - msg = "Data provided by %s_shapes don't match names specified by %s_names (%s vs. %s)"%( - name, name, str(data_shapes), str(data_names)) - if throw: - raise ValueError(msg) - else: - warnings.warn(msg) - - -def _parse_data_desc(data_names, label_names, data_shapes, label_shapes): - """parse data_attrs into DataDesc format and check that names match""" - data_shapes = [x if isinstance(x, DataDesc) else DataDesc(*x) for x in data_shapes] - _check_names_match(data_names, data_shapes, 'data', True) - if label_shapes is not None: - label_shapes = [x if isinstance(x, DataDesc) else DataDesc(*x) for x in label_shapes] - _check_names_match(label_names, label_shapes, 'label', False) - else: - _check_names_match(label_names, [], 'label', False) - return data_shapes, label_shapes - - -class BaseModule(object): - """The base class of a module. 
- - A module represents a computation component. One can think of module as a computation machine. - A module can execute forward and backward passes and update parameters in a model. - We aim to make the APIs easy to use, especially in the case when we need to use the imperative - API to work with multiple modules (e.g. stochastic depth network). - - A module has several states: - - - Initial state: Memory is not allocated yet, so the module is not ready for computation yet. - - Binded: Shapes for inputs, outputs, and parameters are all known, memory has been allocated, - and the module is ready for computation. - - Parameters are initialized: For modules with parameters, doing computation before - initializing the parameters might result in undefined outputs. - - Optimizer is installed: An optimizer can be installed to a module. After this, the parameters - of the module can be updated according to the optimizer after gradients are computed - (forward-backward). - - In order for a module to interact with others, it must be able to report the - following information in its initial state (before binding): - - - `data_names`: list of type string indicating the names of the required input data. - - `output_names`: list of type string indicating the names of the required outputs. - - After binding, a module should be able to report the following richer information: - - - state information - - `binded`: `bool`, indicates whether the memory buffers needed for computation - have been allocated. - - `for_training`: whether the module is bound for training. - - `params_initialized`: `bool`, indicates whether the parameters of this module - have been initialized. - - `optimizer_initialized`: `bool`, indicates whether an optimizer is defined - and initialized. - - `inputs_need_grad`: `bool`, indicates whether gradients with respect to the - input data are needed. Might be useful when implementing composition of modules. - - - input/output information - - `data_shapes`: a list of `(name, shape)`. In theory, since the memory is allocated, - we could directly provide the data arrays. But in the case of data parallelism, - the data arrays might not be of the same shape as viewed from the external world. - - `label_shapes`: a list of `(name, shape)`. This might be `[]` if the module does - not need labels (e.g. it does not contains a loss function at the top), or a module - is not bound for training. - - `output_shapes`: a list of `(name, shape)` for outputs of the module. - - - parameters (for modules with parameters) - - `get_params()`: return a tuple `(arg_params, aux_params)`. Each of those - is a dictionary of name to ``NDArray`` mapping. Those `NDArray` always lives on - CPU. The actual parameters used for computing might live on other devices (GPUs), - this function will retrieve (a copy of) the latest parameters. - - ``set_params(arg_params, aux_params)``: assign parameters to the devices - doing the computation. - - ``init_params(...)``: a more flexible interface to assign or initialize the parameters. - - - setup - - `bind()`: prepare environment for computation. - - `init_optimizer()`: install optimizer for parameter updating. - - `prepare()`: prepare the module based on the current data batch. - - - computation - - `forward(data_batch)`: forward operation. - - `backward(out_grads=None)`: backward operation. - - `update()`: update parameters according to installed optimizer. - - `get_outputs()`: get outputs of the previous forward operation. 
- - `get_input_grads()`: get the gradients with respect to the inputs computed - in the previous backward operation. - - `update_metric(metric, labels, pre_sliced=False)`: update performance metric - for the previous forward - computed results. - - - other properties (mostly for backward compatibility) - - `symbol`: the underlying symbolic graph for this module (if any) - This property is not necessarily constant. For example, for `BucketingModule`, - this property is simply the *current* symbol being used. For other modules, - this value might not be well defined. - - When those intermediate-level API are implemented properly, the following - high-level API will be automatically available for a module: - - - `fit`: train the module parameters on a data set. - - `predict`: run prediction on a data set and collect outputs. - - `score`: run prediction on a data set and evaluate performance. - - Examples - -------- - >>> # An example of creating a mxnet module. - >>> import mxnet as mx - >>> data = mx.symbol.Variable('data') - >>> fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128) - >>> act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu") - >>> fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64) - >>> act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") - >>> fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10) - >>> out = mx.symbol.SoftmaxOutput(fc3, name = 'softmax') - >>> mod = mx.mod.Module(out) - """ - def __init__(self, logger=logging): - self.logger = logger - self.binded = False - self.for_training = False - self.inputs_need_grad = False - self.params_initialized = False - self.optimizer_initialized = False - self._symbol = None - self._total_exec_bytes = 0 - - ################################################################################ - # High Level API - ################################################################################ - def forward_backward(self, data_batch): - """A convenient function that calls both ``forward`` and ``backward``.""" - self.forward(data_batch, is_train=True) - self.backward() - - def score(self, eval_data, eval_metric, num_batch=None, batch_end_callback=None, - score_end_callback=None, - reset=True, epoch=0, sparse_row_id_fn=None): - """Runs prediction on ``eval_data`` and evaluates the performance according to - the given ``eval_metric``. - - Checkout `Module Tutorial `_ - to see an end-to-end use-case. - - Parameters - ---------- - eval_data : DataIter - Evaluation data to run prediction on. - eval_metric : EvalMetric or list of EvalMetrics - Evaluation metric to use. - num_batch : int - Number of batches to run. Defaults to ``None``, indicating run until the `DataIter` - finishes. - batch_end_callback : function - Could also be a list of functions. - reset : bool - Defaults to ``True``. Indicates whether we should reset `eval_data` before starting - evaluating. - epoch : int - Defaults to 0. For compatibility, this will be passed to callbacks (if any). - During training, this will correspond to the training epoch number. - sparse_row_id_fn : A callback function - The function takes `data_batch` as an input and returns a dict of - str -> NDArray. The resulting dict is used for pulling row_sparse - parameters from the kvstore, where the str key is the name of the param, - and the value is the row id of the param to pull. - - Examples - -------- - >>> # An example of using score for prediction. 
- >>> # Evaluate accuracy on val_dataiter - >>> metric = mx.gluon.metric.Accuracy() - >>> mod.score(val_dataiter, metric) - >>> mod.score(val_dataiter, ['mse', 'acc']) - """ - assert self.binded and self.params_initialized - - if reset: - eval_data.reset() - - if not isinstance(eval_metric, metric.EvalMetric): - eval_metric = metric.create(eval_metric) - - eval_metric.reset() - actual_num_batch = 0 - - for nbatch, eval_batch in enumerate(eval_data): - if num_batch is not None and nbatch == num_batch: - break - self.prepare(eval_batch, sparse_row_id_fn=sparse_row_id_fn) - self.forward(eval_batch, is_train=False) - if isinstance(eval_batch, list): - self.update_metric(eval_metric, [eb.label for eb in eval_batch], pre_sliced=True) - else: - self.update_metric(eval_metric, eval_batch.label) - - if batch_end_callback is not None: - batch_end_params = BatchEndParam(epoch=epoch, - nbatch=nbatch, - eval_metric=eval_metric, - locals=locals()) - for callback in _as_list(batch_end_callback): - callback(batch_end_params) - actual_num_batch += 1 - - if score_end_callback: - params = BatchEndParam(epoch=epoch, - nbatch=actual_num_batch, - eval_metric=eval_metric, - locals=locals()) - for callback in _as_list(score_end_callback): - callback(params) - - return eval_metric.get_name_value() - - def iter_predict(self, eval_data, num_batch=None, reset=True, sparse_row_id_fn=None): - """Iterates over predictions. - - Examples - -------- - >>> for pred, i_batch, batch in module.iter_predict(eval_data): - ... # pred is a list of outputs from the module - ... # i_batch is a integer - ... # batch is the data batch from the data iterator - - Parameters - ---------- - eval_data : DataIter - Evaluation data to run prediction on. - num_batch : int - Default is ``None``, indicating running all the batches in the data iterator. - reset : bool - Default is ``True``, indicating whether we should reset the data iter before start - doing prediction. - sparse_row_id_fn : A callback function - The function takes `data_batch` as an input and returns a dict of - str -> NDArray. The resulting dict is used for pulling row_sparse - parameters from the kvstore, where the str key is the name of the param, - and the value is the row id of the param to pull. - """ - assert self.binded and self.params_initialized - - if reset: - eval_data.reset() - - for nbatch, eval_batch in enumerate(eval_data): - if num_batch is not None and nbatch == num_batch: - break - self.prepare(eval_batch, sparse_row_id_fn=sparse_row_id_fn) - self.forward(eval_batch, is_train=False) - pad = eval_batch.pad - outputs = [out[0:out.shape[0]-pad] for out in self.get_outputs()] - - yield (outputs, nbatch, eval_batch) - - def predict(self, eval_data, num_batch=None, merge_batches=True, reset=True, - always_output_list=False, sparse_row_id_fn=None): - """Runs prediction and collects the outputs. - - When `merge_batches` is ``True`` (by default), the return value will be a list - ``[out1, out2, out3]``, where each element is formed by concatenating the outputs for - all the mini-batches. When `always_output_list` is ``False`` (as by default), - then in the case of a single output, `out1` is returned instead of ``[out1]``. - - When `merge_batches` is ``False``, the return value will be a nested list like - ``[[out1_batch1, out2_batch1], [out1_batch2], ...]``. This mode is useful because - in some cases (e.g. bucketing), the module does not necessarily produce the same - number of outputs. - - The objects in the results have type `NDArray`. 
If you need to work with a numpy array, - just call ``.asnumpy()`` on each `NDArray`. - - Parameters - ---------- - eval_data : DataIter or NDArray or numpy array - Evaluation data to run prediction on. - num_batch : int - Defaults to ``None``, indicates running all the batches in the data iterator. - merge_batches : bool - Defaults to ``True``, see above for return values. - reset : bool - Defaults to ``True``, indicates whether we should reset the data iter before - doing prediction. - always_output_list : bool - Defaults to ``False``, see above for return values. - sparse_row_id_fn : A callback function - The function takes `data_batch` as an input and returns a dict of - str -> NDArray. The resulting dict is used for pulling row_sparse - parameters from the kvstore, where the str key is the name of the param, - and the value is the row id of the param to pull. - - Returns - ------- - list of NDArray or list of list of NDArray - Prediction results. - - Examples - -------- - >>> # An example of using `predict` for prediction. - >>> # Predict on the first 10 batches of val_dataiter - >>> mod.predict(eval_data=val_dataiter, num_batch=10) - """ - assert self.binded and self.params_initialized - - if isinstance(eval_data, (ndarray.NDArray, np.ndarray)): - if isinstance(eval_data, np.ndarray): - eval_data = ndarray.array(eval_data) - self.forward(DataBatch([eval_data])) - return self.get_outputs()[0] - - if not isinstance(eval_data, DataIter): - raise ValueError('eval_data must be of type NDArray or DataIter') - - if reset: - eval_data.reset() - - output_list = [] - - for nbatch, eval_batch in enumerate(eval_data): - if num_batch is not None and nbatch == num_batch: - break - self.prepare(eval_batch, sparse_row_id_fn=sparse_row_id_fn) - self.forward(eval_batch, is_train=False) - pad = eval_batch.pad - outputs = [out[0:out.shape[0]-pad].copy() for out in self.get_outputs()] - - output_list.append(outputs) - - if len(output_list) == 0: - return output_list - - if merge_batches: - num_outputs = len(output_list[0]) - for out in output_list: - assert len(out) == num_outputs, \ - 'Cannot merge batches, as num of outputs is not the same ' + \ - 'in mini-batches. Maybe bucketing is used?' - output_list2 = [ndarray.concatenate([out[i] for out in output_list]) - for i in range(num_outputs)] - - if num_outputs == 1 and not always_output_list: - return output_list2[0] - return output_list2 - - return output_list - - def fit(self, train_data, eval_data=None, eval_metric='acc', - epoch_end_callback=None, batch_end_callback=None, kvstore='local', - optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), - eval_end_callback=None, - eval_batch_end_callback=None, initializer=Uniform(0.01), - arg_params=None, aux_params=None, allow_missing=False, - force_rebind=False, force_init=False, begin_epoch=0, num_epoch=None, - validation_metric=None, monitor=None, sparse_row_id_fn=None): - """Trains the module parameters. - - Checkout `Module Tutorial `_ - to see an end-to-end use-case. - - Parameters - ---------- - train_data : DataIter - Train DataIter. - eval_data : DataIter - If not ``None``, will be used as validation set and the performance - after each epoch will be evaluated. - eval_metric : str or EvalMetric - Defaults to 'accuracy'. The performance measure used to display during training. - Other possible predefined metrics are: - 'ce' (CrossEntropy), 'f1', 'mae', 'mse', 'rmse', 'top_k_accuracy'. 
- epoch_end_callback : function or list of functions - Each callback will be called with the current `epoch`, `symbol`, `arg_params` - and `aux_params`. - batch_end_callback : function or list of function - Each callback will be called with a `BatchEndParam`. - kvstore : str or KVStore - Defaults to 'local'. - optimizer : str or Optimizer - Defaults to 'sgd'. - optimizer_params : dict - Defaults to ``(('learning_rate', 0.01),)``. The parameters for - the optimizer constructor. - The default value is not a dict, just to avoid pylint warning on dangerous - default values. - eval_end_callback : function or list of function - These will be called at the end of each full evaluation, with the metrics over - the entire evaluation set. - eval_batch_end_callback : function or list of function - These will be called at the end of each mini-batch during evaluation. - initializer : Initializer - The initializer is called to initialize the module parameters when they are - not already initialized. - arg_params : dict - Defaults to ``None``, if not ``None``, should be existing parameters from a trained - model or loaded from a checkpoint (previously saved model). In this case, - the value here will be used to initialize the module parameters, unless they - are already initialized by the user via a call to `init_params` or `fit`. - `arg_params` has a higher priority than `initializer`. - aux_params : dict - Defaults to ``None``. Similar to `arg_params`, except for auxiliary states. - allow_missing : bool - Defaults to ``False``. Indicates whether to allow missing parameters when `arg_params` - and `aux_params` are not ``None``. If this is ``True``, then the missing parameters - will be initialized via the `initializer`. - force_rebind : bool - Defaults to ``False``. Whether to force rebinding the executors if already bound. - force_init : bool - Defaults to ``False``. Indicates whether to force initialization even if the - parameters are already initialized. - begin_epoch : int - Defaults to 0. Indicates the starting epoch. Usually, if resumed from a - checkpoint saved at a previous training phase at epoch N, then this value should be - N+1. - num_epoch : int - Number of epochs for training. - sparse_row_id_fn : A callback function - The function takes `data_batch` as an input and returns a dict of - str -> NDArray. The resulting dict is used for pulling row_sparse - parameters from the kvstore, where the str key is the name of the param, - and the value is the row id of the param to pull. - - Examples - -------- - >>> # An example of using fit for training. - >>> # Assume training dataIter and validation dataIter are ready - >>> # Assume loading a previously checkpointed model - >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, 3) - >>> mod.fit(train_data=train_dataiter, eval_data=val_dataiter, optimizer='sgd', - ... optimizer_params={'learning_rate':0.01, 'momentum': 0.9}, - ... arg_params=arg_params, aux_params=aux_params, - ... 
eval_metric='acc', num_epoch=10, begin_epoch=3) - """ - assert num_epoch is not None, 'please specify number of epochs' - - self.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label, - for_training=True, force_rebind=force_rebind) - if monitor is not None: - self.install_monitor(monitor) - self.init_params(initializer=initializer, arg_params=arg_params, aux_params=aux_params, - allow_missing=allow_missing, force_init=force_init) - self.init_optimizer(kvstore=kvstore, optimizer=optimizer, - optimizer_params=optimizer_params) - - if validation_metric is None: - validation_metric = eval_metric - if not isinstance(eval_metric, metric.EvalMetric): - eval_metric = metric.create(eval_metric) - - ################################################################################ - # training loop - ################################################################################ - for epoch in range(begin_epoch, num_epoch): - tic = time.time() - eval_metric.reset() - nbatch = 0 - data_iter = iter(train_data) - end_of_batch = False - next_data_batch = next(data_iter) - while not end_of_batch: - data_batch = next_data_batch - if monitor is not None: - monitor.tic() - self.forward_backward(data_batch) - self.update() - - if isinstance(data_batch, list): - self.update_metric(eval_metric, - [db.label for db in data_batch], - pre_sliced=True) - else: - self.update_metric(eval_metric, data_batch.label) - - try: - # pre fetch next batch - next_data_batch = next(data_iter) - self.prepare(next_data_batch, sparse_row_id_fn=sparse_row_id_fn) - except StopIteration: - end_of_batch = True - - if monitor is not None: - monitor.toc_print() - - if end_of_batch: - eval_name_vals = eval_metric.get_name_value() - - if batch_end_callback is not None: - batch_end_params = BatchEndParam(epoch=epoch, nbatch=nbatch, - eval_metric=eval_metric, - locals=locals()) - for callback in _as_list(batch_end_callback): - callback(batch_end_params) - nbatch += 1 - - # one epoch of training is finished - for name, val in eval_name_vals: - self.logger.info('Epoch[%d] Train-%s=%f', epoch, name, val) - toc = time.time() - self.logger.info('Epoch[%d] Time cost=%.3f', epoch, (toc-tic)) - - # sync aux params across devices - arg_params, aux_params = self.get_params() - self.set_params(arg_params, aux_params) - - if epoch_end_callback is not None: - for callback in _as_list(epoch_end_callback): - callback(epoch, self.symbol, arg_params, aux_params) - - #---------------------------------------- - # evaluation on validation set - if eval_data: - res = self.score(eval_data, validation_metric, - score_end_callback=eval_end_callback, - batch_end_callback=eval_batch_end_callback, epoch=epoch) - #TODO: pull this into default - for name, val in res: - self.logger.info('Epoch[%d] Validation-%s=%f', epoch, name, val) - - # end of 1 epoch, reset the data-iter for another epoch - train_data.reset() - - ################################################################################ - # Symbol information - ################################################################################ - @property - def data_names(self): - """A list of names for data required by this module.""" - raise NotImplementedError() - - @property - def output_names(self): - """A list of names for the outputs of this module.""" - raise NotImplementedError() - - ################################################################################ - # Input/Output information - ################################################################################ - 
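# A minimal, self-contained sketch of the high-level `fit`/`score` workflow on the
# Module API removed by this patch, assuming an MXNet 1.x installation where
# `mx.mod` is still available; the toy MLP and random data below are illustrative only.
import numpy as np
import mxnet as mx

data = mx.sym.Variable('data')
fc1 = mx.sym.FullyConnected(data, name='fc1', num_hidden=128)
act1 = mx.sym.Activation(fc1, name='relu1', act_type='relu')
fc2 = mx.sym.FullyConnected(act1, name='fc2', num_hidden=10)
out = mx.sym.SoftmaxOutput(fc2, name='softmax')

# NDArrayIter exposes the default 'data'/'softmax_label' names expected by Module.
x = np.random.uniform(size=(100, 20)).astype('float32')
y = np.random.randint(0, 10, size=(100,)).astype('float32')
train_iter = mx.io.NDArrayIter(x, y, batch_size=10, shuffle=True)
val_iter = mx.io.NDArrayIter(x, y, batch_size=10)

mod = mx.mod.Module(out)
mod.fit(train_iter, eval_data=val_iter, optimizer='sgd',
        optimizer_params={'learning_rate': 0.01, 'momentum': 0.9},
        eval_metric='acc', num_epoch=2)
print(mod.score(val_iter, 'acc'))   # e.g. [('accuracy', ...)]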
@property - def data_shapes(self): - """A list of (name, shape) pairs specifying the data inputs to this module.""" - raise NotImplementedError() - - @property - def label_shapes(self): - """A list of (name, shape) pairs specifying the label inputs to this module. - If this module does not accept labels -- either it is a module without loss - function, or it is not bound for training, then this should return an empty - list ``[]``. - """ - raise NotImplementedError() - - @property - def output_shapes(self): - """A list of (name, shape) pairs specifying the outputs of this module.""" - raise NotImplementedError() - - ################################################################################ - # Parameters of a module - ################################################################################ - def get_params(self): - """Gets parameters, those are potentially copies of the actual parameters used - to do computation on the device. - - Returns - ------- - ``(arg_params, aux_params)`` - A pair of dictionaries each mapping parameter names to NDArray values. - - Examples - -------- - >>> # An example of getting module parameters. - >>> print mod.get_params() - ({'fc2_weight': , 'fc1_weight': , - 'fc3_bias': , 'fc3_weight': , - 'fc2_bias': , 'fc1_bias': }, {}) - """ - raise NotImplementedError() - - def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None, - allow_missing=False, force_init=False, allow_extra=False): - """Initializes the parameters and auxiliary states. - - Parameters - ---------- - initializer : Initializer - Called to initialize parameters if needed. - arg_params : dict - If not ``None``, should be a dictionary of existing `arg_params`. Initialization - will be copied from that. - aux_params : dict - If not ``None``, should be a dictionary of existing `aux_params`. Initialization - will be copied from that. - allow_missing : bool - If ``True``, params could contain missing values, and the initializer will be - called to fill those missing params. - force_init : bool - If ``True``, `force_init` will force re-initialize even if already initialized. - allow_extra : boolean, optional - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. - - Examples - -------- - >>> # An example of initializing module parameters. - >>> mod.init_params() - """ - raise NotImplementedError() - - def set_params(self, arg_params, aux_params, allow_missing=False, force_init=True, - allow_extra=False): - """Assigns parameter and aux state values. - - Parameters - ---------- - arg_params : dict - Dictionary of name to value (`NDArray`) mapping. - aux_params : dict - Dictionary of name to value (`NDArray`) mapping. - allow_missing : bool - If ``True``, params could contain missing values, and the initializer will be - called to fill those missing params. - force_init : bool - If ``True``, will force re-initialize even if already initialized. - allow_extra : boolean, optional - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. - - Examples - -------- - >>> # An example of setting module parameters. 
- >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, n_epoch_load) - >>> mod.set_params(arg_params=arg_params, aux_params=aux_params) - """ - self.init_params(initializer=None, arg_params=arg_params, aux_params=aux_params, - allow_missing=allow_missing, force_init=force_init, - allow_extra=allow_extra) - - def save_params(self, fname): - """Saves model parameters to file. - - Parameters - ---------- - fname : str - Path to output param file. - - Examples - -------- - >>> # An example of saving module parameters. - >>> mod.save_params('myfile') - """ - arg_params, aux_params = self.get_params() - save_dict = {('arg:%s' % k) : v.as_in_context(cpu()) for k, v in arg_params.items()} - save_dict.update({('aux:%s' % k) : v.as_in_context(cpu()) for k, v in aux_params.items()}) - ndarray.save(fname, save_dict) - - def load_params(self, fname): - """Loads model parameters from file. - - Parameters - ---------- - fname : str - Path to input param file. - - Examples - -------- - >>> # An example of loading module parameters. - >>> mod.load_params('myfile') - """ - save_dict = ndarray.load(fname) - arg_params = {} - aux_params = {} - for k, value in save_dict.items(): - arg_type, name = k.split(':', 1) - if arg_type == 'arg': - arg_params[name] = value - elif arg_type == 'aux': - aux_params[name] = value - else: - raise ValueError("Invalid param file " + fname) - self.set_params(arg_params, aux_params) - - def get_states(self, merge_multi_context=True): - """Gets states from all devices - - If `merge_multi_context` is ``True``, returns output of form ``[out1, out2]``. - Otherwise, it returns output of the form - ``[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]``. - All output elements are `NDArray`. - - Parameters - ---------- - merge_multi_context : bool - Defaults to ``True``. In the case when data-parallelism is used, the states - will be collected from multiple devices. A ``True`` value indicates that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - A list of ``NDArray`` or a list of list of ``NDArray``. - """ - assert self.binded and self.params_initialized - assert not merge_multi_context - return [] - - def set_states(self, states=None, value=None): - """Sets value for states. Only one of states & value can be specified. - - Parameters - ---------- - states : list of list of NDArray - Source states arrays formatted like - ``[[state1_dev1, state1_dev2], [state2_dev1, state2_dev2]]``. - value : number - A single scalar value for all state arrays. - """ - assert self.binded and self.params_initialized - assert not states and not value - - def install_monitor(self, mon): - """Installs monitor on all executors.""" - raise NotImplementedError() - - ################################################################################ - # Computations - ################################################################################ - # pylint: disable=unused-argument - def prepare(self, data_batch, sparse_row_id_fn=None): - '''Prepares the module for processing a data batch. - - Usually involves switching bucket and reshaping. - For modules that contain `row_sparse` parameters in KVStore, - it prepares the `row_sparse` parameters based on the sparse_row_id_fn. - - When KVStore is used to update parameters for multi-device or multi-machine training, - a copy of the parameters are stored in KVStore. 
Note that for `row_sparse` parameters, - the `update()` updates the copy of parameters in KVStore, but doesn't broadcast - the updated parameters to all devices / machines. The `prepare` function is used to - broadcast `row_sparse` parameters with the next batch of data. - - Parameters - ---------- - data_batch : DataBatch - The current batch of data for forward computation. - - sparse_row_id_fn : A callback function - The function takes `data_batch` as an input and returns a dict of - str -> NDArray. The resulting dict is used for pulling row_sparse - parameters from the kvstore, where the str key is the name of the param, - and the value is the row id of the param to pull. - ''' - if sparse_row_id_fn is not None: - warnings.warn(UserWarning("sparse_row_id_fn is not invoked for BaseModule.")) - # pylint: enable=unused-argument - - def forward(self, data_batch, is_train=None): - """Forward computation. It supports data batches with different shapes, such as - different batch sizes or different image sizes. - If reshaping of data batch relates to modification of symbol or module, such as - changing image layout ordering or switching from training to predicting, module - rebinding is required. - - Parameters - ---------- - data_batch : DataBatch - Could be anything with similar API implemented. - is_train : bool - Default is ``None``, which means `is_train` takes the value of ``self.for_training``. - - Examples - -------- - >>> import mxnet as mx - >>> from collections import namedtuple - >>> Batch = namedtuple('Batch', ['data']) - >>> data = mx.sym.Variable('data') - >>> out = data * 2 - >>> mod = mx.mod.Module(symbol=out, label_names=None) - >>> mod.bind(data_shapes=[('data', (1, 10))]) - >>> mod.init_params() - >>> data1 = [mx.nd.ones((1, 10))] - >>> mod.forward(Batch(data1)) - >>> print mod.get_outputs()[0].asnumpy() - [[ 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]] - >>> # Forward with data batch of different shape - >>> data2 = [mx.nd.ones((3, 5))] - >>> mod.forward(Batch(data2)) - >>> print mod.get_outputs()[0].asnumpy() - [[ 2. 2. 2. 2. 2.] - [ 2. 2. 2. 2. 2.] - [ 2. 2. 2. 2. 2.]] - """ - raise NotImplementedError() - - def backward(self, out_grads=None): - """Backward computation. - - Parameters - ---------- - out_grads : NDArray or list of NDArray, optional - Gradient on the outputs to be propagated back. - This parameter is only needed when bind is called - on outputs that are not a loss function. - - Examples - -------- - >>> # An example of backward computation. - >>> mod.backward() - >>> print mod.get_input_grads()[0].asnumpy() - [[[ 1.10182791e-05 5.12257748e-06 4.01927764e-06 8.32566820e-06 - -1.59775993e-06 7.24269375e-06 7.28067835e-06 -1.65902311e-05 - 5.46342608e-06 8.44196393e-07] - ...]] - """ - raise NotImplementedError() - - def get_outputs(self, merge_multi_context=True): - """Gets outputs of the previous forward computation. - - If `merge_multi_context` is ``True``, it is like ``[out1, out2]``. Otherwise, - it returns out put of form ``[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]``. - All the output elements have type `NDArray`. When `merge_multi_context` is ``False``, - those `NDArray` instances might live on different devices. - - Parameters - ---------- - merge_multi_context : bool - Defaults to ``True``. In the case when data-parallelism is used, the outputs - will be collected from multiple devices. A ``True`` value indicates that we - should merge the collected results so that they look like from a single - executor. 
- - Returns - ------- - list of `NDArray` or list of list of `NDArray`. - Output - - Examples - -------- - >>> # An example of getting forward output. - >>> print mod.get_outputs()[0].asnumpy() - [[ 0.09999977 0.10000153 0.10000716 0.10000195 0.09999853 0.09999743 - 0.10000272 0.10000113 0.09999088 0.09999888]] - """ - raise NotImplementedError() - - def get_input_grads(self, merge_multi_context=True): - """Gets the gradients to the inputs, computed in the previous backward computation. - - If `merge_multi_context` is ``True``, it is like ``[grad1, grad2]``. Otherwise, it - is like ``[[grad1_dev1, grad1_dev2], [grad2_dev1, grad2_dev2]]``. All the output - elements have type `NDArray`. When `merge_multi_context` is ``False``, those `NDArray` - instances might live on different devices. - - Parameters - ---------- - merge_multi_context : bool - Defaults to ``True``. In the case when data-parallelism is used, the gradients - will be collected from multiple devices. A ``True`` value indicates that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - list of NDArray or list of list of NDArray - Input gradients. - - Examples - -------- - >>> # An example of getting input gradients. - >>> print mod.get_input_grads()[0].asnumpy() - [[[ 1.10182791e-05 5.12257748e-06 4.01927764e-06 8.32566820e-06 - -1.59775993e-06 7.24269375e-06 7.28067835e-06 -1.65902311e-05 - 5.46342608e-06 8.44196393e-07] - ...]] - """ - raise NotImplementedError() - - def update(self): - """Updates parameters according to the installed optimizer and the gradients computed - in the previous forward-backward batch. - - When KVStore is used to update parameters for multi-device or multi-machine training, - a copy of the parameters are stored in KVStore. Note that for `row_sparse` parameters, - this function does update the copy of parameters in KVStore, but doesn't broadcast the - updated parameters to all devices / machines. Please call `prepare` to broadcast - `row_sparse` parameters with the next batch of data. - - Examples - -------- - >>> # An example of updating module parameters. - >>> mod.init_optimizer(kvstore='local', optimizer='sgd', - ... optimizer_params=(('learning_rate', 0.01), )) - >>> mod.backward() - >>> mod.update() - >>> print mod.get_params()[0]['fc3_weight'].asnumpy() - [[ 5.86930104e-03 5.28078526e-03 -8.88729654e-03 -1.08308345e-03 - 6.13054074e-03 4.27560415e-03 1.53817423e-03 4.62131854e-03 - 4.69872449e-03 -2.42400169e-03 9.94111411e-04 1.12386420e-03 - ...]] - """ - raise NotImplementedError() - - def update_metric(self, eval_metric, labels, pre_sliced=False): - """Evaluates and accumulates evaluation metric on outputs of the last forward - computation. - - Parameters - ---------- - eval_metric : EvalMetric - Evaluation metric to use. - labels : list of NDArray if `pre_sliced` parameter is set to `False`, - list of lists of NDArray otherwise. Typically `data_batch.label`. - pre_sliced: bool - Whether the labels are already sliced per device (default: False). - - Examples - -------- - >>> # An example of updating evaluation metric. 
- >>> mod.forward(data_batch) - >>> mod.update_metric(metric, data_batch.label) - """ - raise NotImplementedError() - - ################################################################################ - # module setup - ################################################################################ - def bind(self, data_shapes, label_shapes=None, for_training=True, - inputs_need_grad=False, force_rebind=False, shared_module=None, - grad_req='write'): - """Binds the symbols to construct executors. This is necessary before one - can perform computation with the module. - - Parameters - ---------- - data_shapes : list of (str, tuple) or DataDesc objects - Typically is ``data_iter.provide_data``. Can also be a list of - (data name, data shape). - label_shapes : list of (str, tuple) or DataDesc objects - Typically is ``data_iter.provide_label``. Can also be a list of - (label name, label shape). - for_training : bool - Default is ``True``. Whether the executors should be bind for training. - inputs_need_grad : bool - Default is ``False``. Whether the gradients to the input data need to be computed. - Typically this is not needed. But this might be needed when implementing composition - of modules. - force_rebind : bool - Default is ``False``. This function does nothing if the executors are already - bound. But with this ``True``, the executors will be forced to rebind. - shared_module : Module - Default is ``None``. This is used in bucketing. When not ``None``, the shared module - essentially corresponds to a different bucket -- a module with different symbol - but with the same sets of parameters (e.g. unrolled RNNs with different lengths). - grad_req : str, list of str, dict of str to str - Requirement for gradient accumulation. Can be 'write', 'add', or 'null' - (default to 'write'). - Can be specified globally (str) or for each argument (list, dict). - - Examples - -------- - >>> # An example of binding symbols. - >>> mod.bind(data_shapes=[('data', (1, 10, 10))]) - >>> # Assume train_iter is already created. - >>> mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) - """ - raise NotImplementedError() - - def init_optimizer(self, kvstore='local', optimizer='sgd', - optimizer_params=(('learning_rate', 0.01),), force_init=False): - """Installs and initializes optimizers, as well as initialize kvstore for - distributed training - - Parameters - ---------- - kvstore : str or KVStore - Defaults to `'local'`. - optimizer : str or Optimizer - Defaults to `'sgd'`. - optimizer_params : dict - Defaults to ``(('learning_rate', 0.01),)``. The default value is not a dictionary, - just to avoid pylint warning of dangerous default values. - force_init : bool - Defaults to ``False``, indicates whether to force re-initializing an optimizer - if it is already installed. - - Examples - -------- - >>> # An example of initializing optimizer. - >>> mod.init_optimizer(optimizer='sgd', optimizer_params=(('learning_rate', 0.005),)) - """ - raise NotImplementedError() - - ################################################################################ - # misc - ################################################################################ - @property - def symbol(self): - """Gets the symbol associated with this module. - - Except for `Module`, for other types of modules (e.g. `BucketingModule`), this - property might not be a constant throughout its life time. Some modules might - not even be associated with any symbols. 
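# A minimal sketch of the intermediate-level workflow documented above
# (bind -> init_params -> init_optimizer -> forward_backward -> update),
# assuming MXNet 1.x; the tiny regression symbol and hand-built batch are illustrative.
from collections import namedtuple
import mxnet as mx

Batch = namedtuple('Batch', ['data', 'label'])

data = mx.sym.Variable('data')
label = mx.sym.Variable('lin_label')
pred = mx.sym.FullyConnected(data, name='pred', num_hidden=1)
loss = mx.sym.LinearRegressionOutput(pred, label, name='lin')

mod = mx.mod.Module(loss, data_names=['data'], label_names=['lin_label'])
mod.bind(data_shapes=[('data', (4, 8))], label_shapes=[('lin_label', (4, 1))],
         for_training=True)
mod.init_params(initializer=mx.init.Uniform(0.01))
mod.init_optimizer(optimizer='sgd', optimizer_params=(('learning_rate', 0.01),))

batch = Batch(data=[mx.nd.ones((4, 8))], label=[mx.nd.zeros((4, 1))])
mod.forward_backward(batch)         # forward(is_train=True) followed by backward()
mod.update()                        # apply the installed optimizer to the parameters
print(mod.get_outputs()[0].shape)   # (4, 1)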
- """ - return self._symbol diff --git a/python/mxnet/module/bucketing_module.py b/python/mxnet/module/bucketing_module.py deleted file mode 100644 index dcf2ad7b8e1e..000000000000 --- a/python/mxnet/module/bucketing_module.py +++ /dev/null @@ -1,702 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=too-many-instance-attributes, too-many-arguments, protected-access -# pylint: disable=too-many-public-methods -"""A `BucketingModule` implement the `BaseModule` API, and allows multiple -symbols to be used depending on the `bucket_key` provided by each different -mini-batch of data. -""" - -import logging -import warnings -import numpy as np - -from .. import context as ctx - -from ..initializer import Uniform -from .. import ndarray as nd -from .. import symbol as sym - -from .base_module import BaseModule, _check_input_names -from .module import Module -from ..model import load_params -from ..name import NameManager - -class BucketingModule(BaseModule): - """This module helps to deal efficiently with varying-length inputs. - - Parameters - ---------- - sym_gen : function - A function when called with a bucket key, returns a triple - ``(symbol, data_names, label_names)``. - default_bucket_key : str (or any python object) - The key for the default bucket. - logger : Logger - context : Context or list of Context - Defaults to ``mx.cpu()`` - work_load_list : list of number - Defaults to ``None``, indicating uniform workload. - fixed_param_names: list of str - Defaults to ``None``, indicating no network parameters are fixed. - state_names : list of str - States are similar to data and label, but not provided by data iterator. - Instead they are initialized to 0 and can be set by set_states() - group2ctxs : dict of str to context or list of context, - or list of dict of str to context - Default is `None`. Mapping the `ctx_group` attribute to the context assignment. - compression_params : dict - Specifies type of gradient compression and additional arguments depending - on the type of compression being used. For example, 2bit compression requires a threshold. - Arguments would then be {'type':'2bit', 'threshold':0.5} - See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. 
- """ - def __init__(self, sym_gen, default_bucket_key=None, logger=logging, - context=ctx.cpu(), work_load_list=None, - fixed_param_names=None, state_names=None, group2ctxs=None, - compression_params=None): - super(BucketingModule, self).__init__(logger=logger) - - assert default_bucket_key is not None - self._default_bucket_key = default_bucket_key - self._sym_gen = sym_gen - - symbol, data_names, label_names = self._call_sym_gen(default_bucket_key) - data_names = list(data_names) if data_names is not None else [] - label_names = list(label_names) if label_names is not None else [] - state_names = list(state_names) if state_names is not None else [] - fixed_param_names = list(fixed_param_names) if fixed_param_names is not None else [] - - _check_input_names(symbol, data_names, "data", True) - _check_input_names(symbol, label_names, "label", False) - _check_input_names(symbol, state_names, "state", True) - _check_input_names(symbol, fixed_param_names, "fixed_param", True) - - self._compression_params = compression_params - self._fixed_param_names = fixed_param_names - self._state_names = state_names - self._context = context - self._work_load_list = work_load_list - self._group2ctxs = group2ctxs - - self._buckets = {} - self._curr_module = None - self._curr_bucket_key = None - self._params_dirty = False - self._monitor = None - self._grad_req = None - - def _reset_bind(self): - """Internal utility function to reset binding.""" - self.binded = False - self._buckets = {} - self._curr_module = None - self._curr_bucket_key = None - - def _call_sym_gen(self, *args, **kwargs): - with NameManager(): - return self._sym_gen(*args, **kwargs) - - @property - def data_names(self): - """A list of names for data required by this module.""" - if self.binded: - return self._curr_module.data_names - else: - _, data_names, _ = self._call_sym_gen(self._default_bucket_key) - return data_names - - @property - def output_names(self): - """A list of names for the outputs of this module.""" - if self.binded: - return self._curr_module.output_names - else: - symbol, _, _ = self._call_sym_gen(self._default_bucket_key) - return symbol.list_outputs() - - @property - def data_shapes(self): - """Get data shapes. - - Returns - ------- - A list of `(name, shape)` pairs. - """ - assert self.binded - return self._curr_module.data_shapes - - @property - def label_shapes(self): - """Get label shapes. - - Returns - ------- - A list of `(name, shape)` pairs. - The return value could be ``None`` if the module does not need labels, - or if the module is not bound for training (in this case, label information - is not available). - """ - assert self.binded - return self._curr_module.label_shapes - - @property - def output_shapes(self): - """Gets output shapes. - - Returns - ------- - A list of `(name, shape)` pairs. - """ - assert self.binded - return self._curr_module.output_shapes - - def get_params(self): - """Gets current parameters. - - Returns - ------- - `(arg_params, aux_params)` - A pair of dictionaries each mapping parameter names to NDArray values. - """ - assert self.params_initialized - self._curr_module._params_dirty = self._params_dirty - params = self._curr_module.get_params() - self._params_dirty = False - return params - - def set_params(self, arg_params, aux_params, allow_missing=False, force_init=True, - allow_extra=False): - """Assigns parameters and aux state values. - - Parameters - ---------- - arg_params : dict - Dictionary of name to value (`NDArray`) mapping. 
- aux_params : dict - Dictionary of name to value (`NDArray`) mapping. - allow_missing : bool - If true, params could contain missing values, and the initializer will be - called to fill those missing params. - force_init : bool - If true, will force re-initialize even if already initialized. - allow_extra : boolean, optional - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. - - Examples - -------- - >>> # An example of setting module parameters. - >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, n_epoch_load) - >>> mod.set_params(arg_params=arg_params, aux_params=aux_params) - """ - if not allow_missing: - self.init_params(initializer=None, arg_params=arg_params, aux_params=aux_params, - allow_missing=allow_missing, force_init=force_init) - return - - if self.params_initialized and not force_init: - warnings.warn("Parameters already initialized and force_init=False. " - "set_params call ignored.", stacklevel=2) - return - - self._curr_module.set_params(arg_params, aux_params, allow_missing=allow_missing, - force_init=force_init, allow_extra=allow_extra) - - # because we didn't update self._arg_params, they are dirty now. - self._params_dirty = True - self.params_initialized = True - - def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None, - allow_missing=False, force_init=False, allow_extra=False): - """Initializes parameters. - - Parameters - ---------- - initializer : Initializer - arg_params : dict - Defaults to ``None``. Existing parameters. This has higher priority - than `initializer`. - aux_params : dict - Defaults to ``None``. Existing auxiliary states. This has higher priority - than `initializer`. - allow_missing : bool - Allow missing values in `arg_params` and `aux_params` (if not ``None``). - In this case, missing values will be filled with `initializer`. - force_init : bool - Defaults to ``False``. - allow_extra : boolean, optional - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. - """ - if self.params_initialized and not force_init: - return - assert self.binded, 'call bind before initializing the parameters' - self._curr_module.init_params(initializer=initializer, arg_params=arg_params, - aux_params=aux_params, allow_missing=allow_missing, - force_init=force_init, allow_extra=allow_extra) - self._params_dirty = False - self.params_initialized = True - - def get_states(self, merge_multi_context=True): - """Gets states from all devices. - - Parameters - ---------- - merge_multi_context : bool - Default is `True`. In the case when data-parallelism is used, the states - will be collected from multiple devices. A `True` value indicate that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - list of NDArrays or list of list of NDArrays - If `merge_multi_context` is ``True``, it is like ``[out1, out2]``. Otherwise, it - is like ``[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]``. All the output - elements are `NDArray`. - """ - assert self.binded and self.params_initialized - return self._curr_module.get_states(merge_multi_context=merge_multi_context) - - def set_states(self, states=None, value=None): - """Sets value for states. 
Only one of states & values can be specified. - - Parameters - ---------- - states : list of list of NDArrays - Source states arrays formatted like ``[[state1_dev1, state1_dev2], - [state2_dev1, state2_dev2]]``. - value : number - A single scalar value for all state arrays. - """ - assert self.binded and self.params_initialized - self._curr_module.set_states(states, value) - - def bind(self, data_shapes, label_shapes=None, for_training=True, - inputs_need_grad=False, force_rebind=False, shared_module=None, - grad_req='write'): - """Binding for a `BucketingModule` means setting up the buckets and binding the - executor for the default bucket key. Executors corresponding to other keys are - bound afterwards with `switch_bucket`. - - Parameters - ---------- - data_shapes : list of (str, tuple) - This should correspond to the symbol for the default bucket. - label_shapes : list of (str, tuple) - This should correspond to the symbol for the default bucket. - for_training : bool - Default is ``True``. - inputs_need_grad : bool - Default is ``False``. - force_rebind : bool - Default is ``False``. - shared_module : BucketingModule - Default is ``None``. This value is currently not used. - grad_req : str, list of str, dict of str to str - Requirement for gradient accumulation. Can be 'write', 'add', or 'null' - (default to 'write'). - Can be specified globally (str) or for each argument (list, dict). - bucket_key : str (or any python object) - bucket key for binding. by default use the default_bucket_key - """ - # in case we already initialized params, keep it - if self.params_initialized: - arg_params, aux_params = self.get_params() - - # force rebinding is typically used when one want to switch from - # training to prediction phase. - if force_rebind: - self._reset_bind() - - if self.binded: - self.logger.warning('Already bound, ignoring bind()') - return - - assert shared_module is None, 'shared_module for BucketingModule is not supported' - - self.for_training = for_training - self.inputs_need_grad = inputs_need_grad - self.binded = True - self._grad_req = grad_req - - symbol, data_names, label_names = self._call_sym_gen(self._default_bucket_key) - module = None - if not self._default_bucket_key in self._buckets: - module = Module(symbol, data_names, label_names, logger=self.logger, - context=self._context, work_load_list=self._work_load_list, - fixed_param_names=self._fixed_param_names, - state_names=self._state_names, - group2ctxs=self._group2ctxs, - compression_params=self._compression_params) - else: - module = self._buckets[self._default_bucket_key] - module.bind(data_shapes, label_shapes, for_training, inputs_need_grad, - force_rebind=False, shared_module=None, grad_req=self._grad_req) - self._curr_module = module - self._curr_bucket_key = self._default_bucket_key - self._buckets[self._default_bucket_key] = module - - # copy back saved params, if already initialized - if self.params_initialized: - self.set_params(arg_params, aux_params) - - def switch_bucket(self, bucket_key, data_shapes, label_shapes=None): - """Switches to a different bucket. This will change ``self.curr_module``. - - Parameters - ---------- - bucket_key : str (or any python object) - The key of the target bucket. - data_shapes : list of (str, tuple) - Typically ``data_batch.provide_data``. - label_shapes : list of (str, tuple) - Typically ``data_batch.provide_label``. 
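# A minimal sketch of how the BucketingModule removed here was constructed, bound,
# and switched between buckets, assuming MXNet 1.x; the bucket keys (sequence
# lengths 10 and 5) and the pooling symbol are made up so that every bucket
# shares identically shaped parameters.
import mxnet as mx

def sym_gen(seq_len):
    # Called once per bucket key; returns (symbol, data_names, label_names).
    data = mx.sym.Variable('data')                 # (batch, seq_len, 50)
    label = mx.sym.Variable('softmax_label')
    avg = mx.sym.mean(data, axis=1)                # pool over the variable-length axis
    fc = mx.sym.FullyConnected(avg, name='fc', num_hidden=10)
    out = mx.sym.SoftmaxOutput(fc, label, name='softmax')
    return out, ('data',), ('softmax_label',)

mod = mx.mod.BucketingModule(sym_gen, default_bucket_key=10)
mod.bind(data_shapes=[('data', (32, 10, 50))],
         label_shapes=[('softmax_label', (32,))])
mod.init_params()
# Executors for other buckets are created lazily and share parameters with the default bucket.
mod.switch_bucket(5, data_shapes=[('data', (32, 5, 50))],
                  label_shapes=[('softmax_label', (32,))])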
- """ - assert self.binded, 'call bind before switching bucket' - if not bucket_key in self._buckets: - symbol, data_names, label_names = self._call_sym_gen(bucket_key) - module = Module(symbol, data_names, label_names, - logger=self.logger, context=self._context, - work_load_list=self._work_load_list, - fixed_param_names=self._fixed_param_names, - state_names=self._state_names, - group2ctxs=self._group2ctxs, - compression_params=self._compression_params) - module.bind(data_shapes, label_shapes, self._curr_module.for_training, - self._curr_module.inputs_need_grad, - force_rebind=False, shared_module=self._buckets[self._default_bucket_key], - grad_req=self._grad_req) - if self._monitor is not None: - module.install_monitor(self._monitor) - self._buckets[bucket_key] = module - else: - module = self._buckets[bucket_key] - if not module.binded: - module.bind(data_shapes, label_shapes, self._curr_module.for_training, - self._curr_module.inputs_need_grad, - force_rebind=False, shared_module=self._buckets[self._default_bucket_key], - grad_req=self._grad_req) - - self._curr_module = self._buckets[bucket_key] - self._curr_bucket_key = bucket_key - - def init_optimizer(self, kvstore='local', optimizer='sgd', - optimizer_params=(('learning_rate', 0.01),), - force_init=False): - """Installs and initializes optimizers. - - Parameters - ---------- - kvstore : str or KVStore - Defaults to `'local'`. - optimizer : str or Optimizer - Defaults to `'sgd'` - optimizer_params : dict - Defaults to `(('learning_rate', 0.01),)`. The default value is not a dictionary, - just to avoid pylint warning of dangerous default values. - force_init : bool - Defaults to ``False``, indicating whether we should force re-initializing the - optimizer in the case an optimizer is already installed. - """ - assert self.binded and self.params_initialized - if self.optimizer_initialized and not force_init: - self.logger.warning('optimizer already initialized, ignoring.') - return - - self._curr_module.init_optimizer(kvstore, optimizer, optimizer_params, - force_init=force_init) - for mod in self._buckets.values(): - if mod is not self._curr_module: - mod.borrow_optimizer(self._curr_module) - - self.optimizer_initialized = True - - def prepare(self, data_batch, sparse_row_id_fn=None): - '''Prepares the module for processing a data batch. - - Usually involves switching bucket and reshaping. - For modules that contain `row_sparse` parameters in KVStore, - it prepares the `row_sparse` parameters based on the sparse_row_id_fn. - - Parameters - ---------- - data_batch : DataBatch - The current batch of data for forward computation. - - sparse_row_id_fn : A callback function - The function takes `data_batch` as an input and returns a dict of - str -> NDArray. The resulting dict is used for pulling row_sparse - parameters from the kvstore, where the str key is the name of the param, - and the value is the row id of the param to pull. - ''' - # perform bind if haven't done so - assert self.binded and self.params_initialized - bucket_key = data_batch.bucket_key - original_bucket_key = self._curr_bucket_key - data_shapes = data_batch.provide_data - label_shapes = data_batch.provide_label - self.switch_bucket(bucket_key, data_shapes, label_shapes) - self._curr_module.prepare(data_batch, sparse_row_id_fn=sparse_row_id_fn) - # switch back - self.switch_bucket(original_bucket_key, None, None) - - def forward(self, data_batch, is_train=None): - """Forward computation. 
- - Parameters - ---------- - data_batch : DataBatch - is_train : bool - Defaults to ``None``, in which case `is_train` is take as ``self.for_training``. - """ - assert self.binded and self.params_initialized - self.switch_bucket(data_batch.bucket_key, data_batch.provide_data, - data_batch.provide_label) - self._curr_module.forward(data_batch, is_train=is_train) - - def backward(self, out_grads=None): - """Backward computation.""" - assert self.binded and self.params_initialized - self._curr_module.backward(out_grads=out_grads) - - def update(self): - """Updates parameters according to installed optimizer and the gradient computed - in the previous forward-backward cycle. - - When KVStore is used to update parameters for multi-device or multi-machine training, - a copy of the parameters are stored in KVStore. Note that for `row_sparse` parameters, - this function does update the copy of parameters in KVStore, but doesn't broadcast the - updated parameters to all devices / machines. Please call `prepare` to broadcast - `row_sparse` parameters with the next batch of data. - - """ - assert self.binded and self.params_initialized and self.optimizer_initialized - self._params_dirty = True - self._curr_module.update() - - def get_outputs(self, merge_multi_context=True): - """Gets outputs from a previous forward computation. - - Parameters - ---------- - merge_multi_context : bool - Defaults to ``True``. In the case when data-parallelism is used, the outputs - will be collected from multiple devices. A ``True`` value indicate that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - list of numpy arrays or list of list of numpy arrays - If `merge_multi_context` is ``True``, it is like ``[out1, out2]``. Otherwise, it - is like ``[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]``. All the output - elements are numpy arrays. - """ - assert self.binded and self.params_initialized - return self._curr_module.get_outputs(merge_multi_context=merge_multi_context) - - def get_input_grads(self, merge_multi_context=True): - """Gets the gradients with respect to the inputs of the module. - - Parameters - ---------- - merge_multi_context : bool - Defaults to ``True``. In the case when data-parallelism is used, the outputs - will be collected from multiple devices. A ``True`` value indicate that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - list of NDArrays or list of list of NDArrays - If `merge_multi_context` is ``True``, it is like ``[grad1, grad2]``. Otherwise, it - is like ``[[grad1_dev1, grad1_dev2], [grad2_dev1, grad2_dev2]]``. All the output - elements are `NDArray`. - """ - assert self.binded and self.params_initialized and self.inputs_need_grad - return self._curr_module.get_input_grads(merge_multi_context=merge_multi_context) - - def update_metric(self, eval_metric, labels, pre_sliced=False): - """Evaluates and accumulates evaluation metric on outputs of the last forward computation. - - Parameters - ---------- - eval_metric : EvalMetric - labels : list of NDArray - Typically ``data_batch.label``. 
- """ - assert self.binded and self.params_initialized - self._curr_module.update_metric(eval_metric, labels, pre_sliced) - - @property - def symbol(self): - """The symbol of the current bucket being used.""" - assert self.binded - return self._curr_module.symbol - - def install_monitor(self, mon): - """Installs monitor on all executors """ - assert self.binded - self._monitor = mon - for mod in self._buckets.values(): - mod.install_monitor(mon) - - def save_checkpoint(self, prefix, epoch, remove_amp_cast=False): - """Saves current progress to checkpoint for all buckets in BucketingModule - Use `mx.callback.module_checkpoint` as `epoch_end_callback` to save during training. - - Parameters - ---------- - prefix : str - The file prefix to checkpoint to. - epoch : int - The current epoch number. - """ - - assert len(self._buckets) > 0, "Empty BucketingModule cannot be saved" - param_name = "%s-%04d.params" % (prefix, epoch) - self.save_params(param_name) - for bucket_key in self._buckets: - symbol, _, _ = self._sym_gen(bucket_key) - symbol.save("%s-%s-symbol.json" % (prefix, bucket_key), remove_amp_cast=remove_amp_cast) - nd.save("%s.buckets" % (prefix), nd.array(list(self._buckets.keys()), dtype=np.int32)) - - @staticmethod - def load(prefix, epoch, sym_gen=None, default_bucket_key=None, **kwargs): - """Creates a model from previously saved checkpoint. - - Parameters - ---------- - prefix : str - path prefix of saved model files. You should have - "prefix-symbol.json", "prefix-xxxx.params", and - optionally "prefix-xxxx.states", where xxxx is the - epoch number. - epoch : int - epoch to load. - sym_gen : function - A function when called with a bucket key, returns a triple - ``(symbol, data_names, label_names)``. - provide sym_gen which was used when saving bucketing module. - logger : Logger - Default is `logging`. - context : Context or list of Context - Default is ``cpu()``. - work_load_list : list of number - Default ``None``, indicating uniform workload. - fixed_param_names: list of str - Default ``None``, indicating no network parameters are fixed. - state_names : list of str - States are similar to data and label, but not provided by data iterator. - Instead they are initialized to 0 and can be set by set_states() - group2ctxs : dict of str to context or list of context, - or list of dict of str to context - Default is `None`. Mapping the `ctx_group` attribute to the context assignment. - compression_params : dict - Specifies type of gradient compression and additional arguments depending - on the type of compression being used. For example, 2bit compression requires a threshold. - Arguments would then be {'type':'2bit', 'threshold':0.5} - See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. 
- """ - assert sym_gen is not None, \ - "sym_gen is required for loading BucketingModule" - assert default_bucket_key is not None, \ - "default_bucket_key is required for loading BucketingModule" - buckets = nd.load("%s.buckets" % prefix) - buckets = list(buckets[0].asnumpy().astype('int32')) - bucketing_mod = BucketingModule(sym_gen, default_bucket_key, **kwargs) - for bucket_key in buckets: - _, data_names, label_names = sym_gen(bucket_key) - symbol = sym.load("%s-%s-symbol.json" % (prefix, bucket_key)) - bucketing_mod._buckets[bucket_key] = Module(symbol, data_names, label_names, **kwargs) - if bucket_key == default_bucket_key: - bucketing_mod._curr_module = bucketing_mod._buckets[bucket_key] - arg_params, aux_params = load_params(prefix, epoch) - bucketing_mod._curr_module._arg_params = arg_params - bucketing_mod._curr_module._aux_params = aux_params - bucketing_mod._curr_module.params_initialized = True - bucketing_mod.params_initialized = True - return bucketing_mod - - @staticmethod - def load_dict(sym_dict=None, sym_gen=None, default_bucket_key=None, arg_params=None, - aux_params=None, **kwargs): - """Creates a model from a dict mapping bucket_key to symbols and shared arg_params - and aux_params. - - Parameters - ---------- - sym_dict : dict mapping bucket_key to symbol - Dict mapping bucket key to symbol - sym_gen : function - A function when called with a bucket key, returns a triple - ``(symbol, data_names, label_names)``. - provide sym_gen which was used when saving bucketing module. - default_bucket_key : str (or any python object) - The key for the default bucket. - arg_params : dict - Required for loading the BucketingModule. - Dict of name to parameter ndarrays. - aux_params : dict - Required for loading the BucketingModule. - Dict of name to auxiliary state ndarrays. - logger : Logger - Default is `logging`. - context : Context or list of Context - Default is ``cpu()``. - work_load_list : list of number - Default ``None``, indicating uniform workload. - fixed_param_names: list of str - Default ``None``, indicating no network parameters are fixed. - state_names : list of str - States are similar to data and label, but not provided by data iterator. - Instead they are initialized to 0 and can be set by set_states() - group2ctxs : dict of str to context or list of context, - or list of dict of str to context - Default is `None`. Mapping the `ctx_group` attribute to the context assignment. - compression_params : dict - Specifies type of gradient compression and additional arguments depending - on the type of compression being used. For example, 2bit compression requires a threshold. - Arguments would then be {'type':'2bit', 'threshold':0.5} - See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. 
- """ - - assert sym_dict is not None, \ - "sym_dict needs to be provided for BucketingModule.load_dict" - assert arg_params is not None, \ - "arg_params need to be provided for BucketingModule.load_dict" - assert aux_params is not None, \ - "aux_params need to be provided for BucketingModule.load_dict" - assert default_bucket_key is not None, \ - "default_bucket_key needs to be provided for BucketingModule.load_dict" - - bucketing_mod = BucketingModule(sym_gen, default_bucket_key, **kwargs) - for bucket_key, loaded_sym in sym_dict.items(): - _, data_names, label_names = sym_gen(default_bucket_key) - bucketing_mod._buckets[bucket_key] = Module(loaded_sym, data_names, label_names, **kwargs) - if bucket_key == default_bucket_key: - bucketing_mod._curr_module = bucketing_mod._buckets[bucket_key] - bucketing_mod._curr_module._arg_params = arg_params - bucketing_mod._curr_module._aux_params = aux_params - bucketing_mod._curr_module.params_initialized = True - bucketing_mod.params_initialized = True - return bucketing_mod diff --git a/python/mxnet/module/executor_group.py b/python/mxnet/module/executor_group.py deleted file mode 100644 index f2cb62fc8396..000000000000 --- a/python/mxnet/module/executor_group.py +++ /dev/null @@ -1,703 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=too-many-instance-attributes,too-many-locals -# pylint: disable=too-many-branches,too-many-statements,too-many-arguments -"""Executor group is a convenient tool for managing a group of executors.""" - -import logging -from collections import OrderedDict - -from .. import context as ctx -from .. 
import ndarray as nd -from ..io import DataDesc -from ..executor_manager import _split_input_slice -from ..ndarray import _DTYPE_MX_TO_NP - - -def _load_general(data, targets, major_axis): - """Load a list of arrays into a list of arrays specified by slices.""" - for d_src, d_targets, axis in zip(data, targets, major_axis): # pylint: disable=too-many-nested-blocks - if isinstance(d_targets, nd.NDArray): - d_src.copyto(d_targets) - elif isinstance(d_src, (list, tuple)): - for src, dst in zip(d_src, d_targets): - src.copyto(dst) - else: - for slice_idx, d_dst in d_targets: - if axis >= 0: - # copy slice - shape = d_src.shape - do_crop = (slice_idx.start != 0 or shape[axis] != slice_idx.stop) - # pylint: disable=no-member,protected-access - if do_crop: - if axis == 0: - d_src[slice_idx.start:slice_idx.stop].copyto(d_dst) - else: - if d_src.context == d_dst.context: - nd.slice_axis(d_src, axis=axis, begin=slice_idx.start, - end=slice_idx.stop, out=d_dst) - else: - # on different device, crop and then do cross device copy - d_dst_copy = nd.slice_axis(d_src, axis=axis, begin=slice_idx.start, - end=slice_idx.stop) - d_dst_copy.copyto(d_dst) - else: - d_src.copyto(d_dst) - # pylint: enable=no-member,protected-access - else: - d_src.copyto(d_dst) - - -def _load_data(batch, targets, major_axis): - """Load data into sliced arrays.""" - if isinstance(batch, list): - new_batch = [] - for i in range(len(targets)): - new_batch.append([b.data[i] for b in batch]) - new_targets = [[dst for _, dst in d_target] for d_target in targets] - _load_general(new_batch, new_targets, major_axis) - else: - _load_general(batch.data, targets, major_axis) - - -def _load_label(batch, targets, major_axis): - """Load label into sliced arrays.""" - if isinstance(batch, list): - new_batch = [] - for i in range(len(targets)): - new_batch.append([b.label[i] for b in batch]) - new_targets = [[dst for _, dst in d_target] for d_target in targets] - _load_general(new_batch, new_targets, major_axis) - else: - _load_general(batch.label, targets, major_axis) - - -def _merge_multi_context(outputs, major_axis): - """Merge outputs that lives on multiple context into one, so that they look - like living on one context. - """ - rets = [] - for tensors, axis in zip(outputs, major_axis): - if axis >= 0: - # pylint: disable=no-member,protected-access - if len(tensors) == 1: - rets.append(tensors[0]) - else: - # Concatenate if necessary - rets.append(nd.concat(*[tensor.as_in_context(tensors[0].context) - for tensor in tensors], - dim=axis)) - # pylint: enable=no-member,protected-access - else: - # negative axis means the there is no batch_size axis, and all the - # results should be the same on each device. We simply take the - # first one, without checking they are actually the same - rets.append(tensors[0]) - return rets - -def _prepare_group2ctxs(group2ctxs, ctx_len): - """Prepare the group2contexts, will duplicate the context - if some ctx_group map to only one context. 
- """ - if group2ctxs is None: - return [None] * ctx_len - elif isinstance(group2ctxs, list): - assert(len(group2ctxs) == ctx_len), "length of group2ctxs\ - should be %d" % ctx_len - return group2ctxs - elif isinstance(group2ctxs, dict): - ret = [{} for i in range(ctx_len)] - for k, v in group2ctxs.items(): - ctxs = None - if isinstance(v, ctx.Context): - ctxs = [v] * ctx_len - else: - if len(v) == 1: - ctxs = v * ctx_len - else: - assert(len(v) == ctx_len), "length of group2ctxs[%s]\ - should be %d or 1" % (k, ctx_len) - ctxs = v - for i in range(ctx_len): - ret[i][k] = ctxs[i] - return ret - else: - assert(False), "group2ctxs should be list of dict of str to context,\ - or dict of str to context or list of context" - return False - -class DataParallelExecutorGroup(object): - """A group of executors that lives on a group of devices. - This is a helper class used to implement data parallelization. Each mini-batch will - be split and run on the devices. - - Parameters - ---------- - symbol : Symbol - The common symbolic computation graph for all executors. - contexts : list - A list of contexts. - workload : list - If not ``None``, could be a list of numbers that specify the workload to be assigned - to different context. Larger number indicate heavier workload. - data_shapes : list - Should be a list of (name, shape) tuples, for the shapes of data. Note the order is - important and should be the same as the order that the `DataIter` provide the data. - label_shapes : list - Should be a list of (name, shape) tuples, for the shapes of label. Note the order is - important and should be the same as the order that the `DataIter` provide the label. - param_names : list - A list of strings, indicating the names of parameters (e.g. weights, filters, etc.) - in the computation graph. - for_training : bool - Indicate whether the executors should be bind for training. When not doing training, - the memory for gradients will not be allocated. - inputs_need_grad : bool - Indicate whether the gradients for the input data should be computed. This is currently - not used. It will be useful for implementing composition of modules. - shared_group : DataParallelExecutorGroup - Defaults to ``None``. This is used in bucketing. When not ``None``, it should be a executor - group corresponding to a different bucket. In other words, it will correspond to a different - symbol with the same set of parameters (e.g. unrolled RNNs with different lengths). - In this case the memory regions of the parameters will be shared. - logger : Logger - Default is `logging`. - fixed_param_names: list of str - Parameters to be fixed during training. For these parameters, not gradients - will be calculated and thus no space will be allocated for the gradient. - grad_req : str, list of str, dict of str to str - Requirement for gradient accumulation. Can be 'write', 'add', or 'null' - (default to 'write'). - Can be specified globally (str) or for each argument (list, dict). - group2ctxs : dict of str to context or list of context, - or list of dict of str to context - Default is `None`. Mapping the `ctx_group` attribute to the context assignment. 
- """ - def __init__(self, symbol, contexts, workload, data_shapes, label_shapes, param_names, - for_training, inputs_need_grad, shared_group=None, logger=logging, - fixed_param_names=None, grad_req='write', state_names=None, group2ctxs=None): - self.param_names = param_names - self.arg_names = symbol.list_arguments() - self.aux_names = symbol.list_auxiliary_states() - - self.symbol = symbol - self.contexts = contexts - self.workload = workload - self.group2ctxs = _prepare_group2ctxs(group2ctxs, len(contexts)) - - self.for_training = for_training - self.inputs_need_grad = inputs_need_grad - - self.logger = logger - #In the future we should have a better way to profile memory per device (haibin) - self._total_exec_bytes = 0 - self.fixed_param_names = fixed_param_names - if self.fixed_param_names is None: - self.fixed_param_names = [] - - self.state_names = state_names - if self.state_names is None: - self.state_names = [] - - if not for_training: - grad_req = 'null' - - data_shapes = [x if isinstance(x, DataDesc) else DataDesc(*x) for x in data_shapes] - if label_shapes is not None: - label_shapes = [x if isinstance(x, DataDesc) else DataDesc(*x) for x in label_shapes] - - data_names = [x.name for x in data_shapes] - - if isinstance(grad_req, str): - self.grad_req = {} - for k in self.arg_names: - if k in self.param_names: - self.grad_req[k] = 'null' if k in self.fixed_param_names else grad_req - elif k in data_names: - self.grad_req[k] = grad_req if self.inputs_need_grad else 'null' - else: - self.grad_req[k] = 'null' - elif isinstance(grad_req, (list, tuple)): - assert len(grad_req) == len(self.arg_names) - self.grad_req = dict(zip(self.arg_names, grad_req)) - elif isinstance(grad_req, dict): - self.grad_req = {} - for k in self.arg_names: - if k in self.param_names: - self.grad_req[k] = 'null' if k in self.fixed_param_names else 'write' - elif k in data_names: - self.grad_req[k] = 'write' if self.inputs_need_grad else 'null' - else: - self.grad_req[k] = 'null' - self.grad_req.update(grad_req) - else: - raise ValueError("grad_req must be one of str, list, tuple, or dict.") - - if shared_group is not None: - self.shared_data_arrays = shared_group.shared_data_arrays - else: - self.shared_data_arrays = [{} for _ in contexts] - - # initialize some instance variables - self.batch_size = None - self.slices = None - self.execs = [] - self._default_execs = None - self.data_arrays = None - self.label_arrays = None - self.param_arrays = None - self.state_arrays = None - self.grad_arrays = None - self.aux_arrays = None - self.input_grad_arrays = None - - self.data_shapes = None - self.label_shapes = None - self.data_names = None - self.label_names = None - self.data_layouts = None - self.label_layouts = None - self.output_names = self.symbol.list_outputs() - self.num_outputs = len(self.output_names) - self.output_layouts = [DataDesc.get_batch_axis(self.symbol[index].attr('__layout__')) - for index in range(self.num_outputs)] - - self.bind_exec(data_shapes, label_shapes, shared_group) - - def decide_slices(self, data_shapes): - """Decide the slices for each context according to the workload. - - Parameters - ---------- - data_shapes : list - list of (name, shape) specifying the shapes for the input data or label. 
- """ - assert len(data_shapes) > 0 - major_axis = [DataDesc.get_batch_axis(x.layout) for x in data_shapes] - - for (name, shape), axis in zip(data_shapes, major_axis): - if axis == -1: - continue - - batch_size = shape[axis] - if self.batch_size is not None: - assert batch_size == self.batch_size, ("all data must have the same batch size: " - + ("batch_size = %d, but " % self.batch_size) - + ("%s has shape %s" % (name, shape))) - else: - self.batch_size = batch_size - self.slices = _split_input_slice(self.batch_size, self.workload) - - return major_axis - - def _collect_arrays(self): - """Collect internal arrays from executors.""" - # convenient data structures - - # check if self.slices is populated, if not then that means that there is no batch size - if self.slices: - # based on batch size, slice up data for the given contexts (self.execs) - self.data_arrays = [[(self.slices[i], e.arg_dict[name]) for i, e in enumerate(self.execs)] - for name, _ in self.data_shapes] - else: - # just use the context index as index into the data - self.data_arrays = [[(slice(i, i+1), e.arg_dict[name]) for i, e in enumerate(self.execs)] - for name, _ in self.data_shapes] - - self.state_arrays = [[e.arg_dict[name] for e in self.execs] - for name in self.state_names] - - if self.label_shapes is not None: - self.label_arrays = [[(self.slices[i], e.arg_dict[name]) - for i, e in enumerate(self.execs)] - for name, _ in self.label_shapes] - else: - self.label_arrays = None - - self.param_arrays = [[exec_.arg_arrays[i] for exec_ in self.execs] - for i, name in enumerate(self.arg_names) - if name in self.param_names] - if self.for_training: - self.grad_arrays = [[exec_.grad_arrays[i] for exec_ in self.execs] - for i, name in enumerate(self.arg_names) - if name in self.param_names] - else: - self.grad_arrays = None - - data_names = [x[0] for x in self.data_shapes] - if self.inputs_need_grad: - self.input_grad_arrays = [[exec_.grad_arrays[self.arg_names.index(name)] - for exec_ in self.execs] - for name in data_names if name in self.arg_names] - else: - self.input_grad_arrays = None - - self.aux_arrays = [[exec_.aux_arrays[i] for exec_ in self.execs] - for i in range(len(self.aux_names))] - - def bind_exec(self, data_shapes, label_shapes, shared_group=None, reshape=False): - """Bind executors on their respective devices. 
- - Parameters - ---------- - data_shapes : list - label_shapes : list - shared_group : DataParallelExecutorGroup - reshape : bool - """ - assert reshape or not self.execs - self.batch_size = None - - # calculate workload and bind executors - self.data_layouts = self.decide_slices(data_shapes) - if label_shapes is not None: - # call it to make sure labels has the same batch size as data - self.label_layouts = self.decide_slices(label_shapes) - - for i in range(len(self.contexts)): - data_shapes_i = self._sliced_shape(data_shapes, i, self.data_layouts) - if label_shapes is not None: - label_shapes_i = self._sliced_shape(label_shapes, i, self.label_layouts) - else: - label_shapes_i = [] - - if reshape: - self.execs[i] = self._default_execs[i].reshape( - allow_up_sizing=True, **dict(data_shapes_i + label_shapes_i)) - else: - self.execs.append(self._bind_ith_exec(i, data_shapes_i, label_shapes_i, - shared_group)) - - self.data_shapes = data_shapes - self.label_shapes = label_shapes - self.data_names = [i.name for i in self.data_shapes] - if label_shapes is not None: - self.label_names = [i.name for i in self.label_shapes] - self._collect_arrays() - - def reshape(self, data_shapes, label_shapes): - """Reshape executors. - - Parameters - ---------- - data_shapes : list - label_shapes : list - """ - if data_shapes == self.data_shapes and label_shapes == self.label_shapes: - return - if self._default_execs is None: - self._default_execs = [i for i in self.execs] - self.bind_exec(data_shapes, label_shapes, reshape=True) - - def set_params(self, arg_params, aux_params, allow_extra=False): - """Assign, i.e. copy parameters to all the executors. - - Parameters - ---------- - arg_params : dict - A dictionary of name to `NDArray` parameter mapping. - aux_params : dict - A dictionary of name to `NDArray` auxiliary variable mapping. - allow_extra : boolean, optional - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. - """ - for exec_ in self.execs: - exec_.copy_params_from(arg_params, aux_params, allow_extra_params=allow_extra) - - def get_params(self, arg_params, aux_params): - """ Copy data from each executor to `arg_params` and `aux_params`. - - Parameters - ---------- - arg_params : list of NDArray - Target parameter arrays. - aux_params : list of NDArray - Target aux arrays. - - Notes - ----- - - This function will inplace update the NDArrays in arg_params and aux_params. - """ - for name, block in zip(self.param_names, self.param_arrays): - weight = sum(w.copyto(ctx.cpu()) for w in block) / len(block) - weight.astype(arg_params[name].dtype).copyto(arg_params[name]) - for name, block in zip(self.aux_names, self.aux_arrays): - weight = sum(w.copyto(ctx.cpu()) for w in block) / len(block) - weight.astype(aux_params[name].dtype).copyto(aux_params[name]) - - def forward(self, data_batch, is_train=None): - """Split `data_batch` according to workload and run forward on each devices. - - Parameters - ---------- - data_batch : DataBatch - Or could be any object implementing similar interface. - is_train : bool - The hint for the backend, indicating whether we are during training phase. - Default is `None`, then the value `self.for_training` will be used. 
- Returns - ------- - - """ - _load_data(data_batch, self.data_arrays, self.data_layouts) - if is_train is None: - is_train = self.for_training - - if isinstance(data_batch, list): - if self.label_arrays is not None and data_batch is not None and data_batch[0].label: - _load_label(data_batch, self.label_arrays, self.label_layouts) - else: - if self.label_arrays is not None and data_batch.label: - _load_label(data_batch, self.label_arrays, self.label_layouts) - - for exec_ in self.execs: - exec_.forward(is_train=is_train) - - def get_output_shapes(self): - """Get the shapes of the outputs.""" - outputs = self.execs[0].outputs - shapes = [out.shape for out in outputs] - - concat_shapes = [] - for key, the_shape, axis in zip(self.symbol.list_outputs(), shapes, self.output_layouts): - the_shape = list(the_shape) - if axis >= 0: - the_shape[axis] = self.batch_size - concat_shapes.append((key, tuple(the_shape))) - return concat_shapes - - def get_outputs(self, merge_multi_context=True, begin=0, end=None): - """Get outputs of the previous forward computation. - If begin or end is specified, return [begin, end)-th outputs, - otherwise return all outputs. - - Parameters - ---------- - merge_multi_context : bool - Default is `True`. In the case when data-parallelism is used, the outputs - will be collected from multiple devices. A `True` value indicate that we - should merge the collected results so that they look like from a single - executor. - begin : int - starting index of returned outputs in all outputs - end : int or None - ending index (excluded) of returned outputs. - - Returns - ------- - If `merge_multi_context` is ``True``, it is like ``[out1, out2]``. Otherwise, it - is like ``[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]``. All the output - elements are `NDArray`. - """ - if end is None: - end = self.num_outputs - outputs = [[exec_.outputs[i] for exec_ in self.execs] - for i in range(begin, end)] - if merge_multi_context: - outputs = _merge_multi_context(outputs, self.output_layouts) - return outputs - - def get_states(self, merge_multi_context=True): - """Get states from all devices. - - Parameters - ---------- - merge_multi_context : bool - Default is ``True``. In the case when data-parallelism is used, the states - will be collected from multiple devices. A ``True`` value indicate that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - If `merge_multi_context` is ``True``, it is like ``[out1, out2]``. Otherwise, it - is like ``[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]``. All the output - elements are `NDArray`. - """ - assert not merge_multi_context, \ - "merge_multi_context=True is not supported for get_states yet." - return self.state_arrays - - def set_states(self, states=None, value=None): - """Set value for states. Only one of states & value can be specified. - - Parameters - ---------- - states : list of list of NDArrays - source states arrays formatted like [[state1_dev1, state1_dev2], - [state2_dev1, state2_dev2]]. - value : number - a single scalar value for all state arrays. - """ - if states is not None: - assert value is None, "Only one of states & value can be specified." - _load_general(states, self.state_arrays, (0,)*len(states)) - else: - assert value is not None, "At least one of states & value must be specified." - assert states is None, "Only one of states & value can be specified." 
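To illustrate what merge_multi_context=True meant in practice: per-device outputs were concatenated back along the batch axis, mirroring the _merge_multi_context helper near the top of this file. A tiny sketch with made-up arrays:

    from mxnet import nd

    # two per-device halves of one logical output, e.g. after a forward pass on 2 devices
    out_dev1 = nd.zeros((16, 10))
    out_dev2 = nd.ones((16, 10))

    # merged view that looks like the output of a single executor
    merged = nd.concat(out_dev1, out_dev2.as_in_context(out_dev1.context), dim=0)
    print(merged.shape)  # (32, 10)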
- for d_dst in self.state_arrays: - for dst in d_dst: - dst[:] = value - - def get_input_grads(self, merge_multi_context=True): - """Get the gradients with respect to the inputs of the module. - - Parameters - ---------- - merge_multi_context : bool - Defaults to ``True``. In the case when data-parallelism is used, the outputs - will be collected from multiple devices. A `True` value indicate that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - If `merge_multi_context` is ``True``, it is like ``[grad1, grad2]``. Otherwise, it - is like ``[[grad1_dev1, grad1_dev2], [grad2_dev1, grad2_dev2]]``. All the output - elements are `NDArray`. - """ - assert self.inputs_need_grad - if merge_multi_context: - return _merge_multi_context(self.input_grad_arrays, self.data_layouts) - return self.input_grad_arrays - - def backward(self, out_grads=None): - """Run backward on all devices. A backward should be called after - a call to the forward function. Backward cannot be called unless - ``self.for_training`` is ``True``. - - Parameters - ---------- - out_grads : NDArray or list of NDArray, optional - Gradient on the outputs to be propagated back. - This parameter is only needed when bind is called - on outputs that are not a loss function. - """ - assert self.for_training, 're-bind with for_training=True to run backward' - if out_grads is None: - out_grads = [] - - for i, (exec_, islice) in enumerate(zip(self.execs, self.slices)): - out_grads_slice = [] - for grad, axis in zip(out_grads, self.output_layouts): - if axis >= 0: - # pylint: disable=no-member - og_my_slice = nd.slice_axis(grad, axis=axis, begin=islice.start, - end=islice.stop) - out_grads_slice.append(og_my_slice.as_in_context(self.contexts[i])) - # pylint: enable=no-member - else: - out_grads_slice.append(grad.copyto(self.contexts[i])) - exec_.backward(out_grads=out_grads_slice) - - def update_metric(self, eval_metric, labels, pre_sliced): - """Accumulate the performance according to `eval_metric` on all devices - by comparing outputs from [begin, end) to labels. By default use all - outputs. - - Parameters - ---------- - eval_metric : EvalMetric - The metric used for evaluation. - labels : list of NDArray - Typically comes from `label` of a `DataBatch`. - pre_sliced : bool - Whether labels are already sliced. - begin : int - Starting index of used outputs. - end : int or None - Ending index of used outputs. - """ - for current_exec, (texec, islice) in enumerate(zip(self.execs, self.slices)): - if not pre_sliced: - labels_slice = [] - for label, axis in zip(labels, self.label_layouts): - if axis == 0: - # slicing NDArray along axis 0 can avoid copying - labels_slice.append(label[islice]) - elif axis > 0: - # pylint: disable=no-member - label_my_slice = nd.slice_axis(label, axis=axis, begin=islice.start, - end=islice.stop).as_in_context(label.context) - # pylint: enable=no-member - labels_slice.append(label_my_slice) - else: - labels_slice.append(label) - else: - labels_slice = labels[current_exec] - - labels_ = OrderedDict(zip(self.label_names, labels_slice)) - preds = OrderedDict(zip(self.output_names, texec.outputs)) - eval_metric.update_dict(labels_, preds) - - def _bind_ith_exec(self, i, data_shapes, label_shapes, shared_group): - """Internal utility function to bind the i-th executor. - This function utilizes simple_bind python interface. 
- """ - shared_exec = None if shared_group is None else shared_group.execs[i] - context = self.contexts[i] - shared_data_arrays = self.shared_data_arrays[i] - - input_shapes = dict(data_shapes) - if label_shapes is not None: - input_shapes.update(dict(label_shapes)) - - input_types = {x.name: x.dtype for x in data_shapes} - attr_dict = self.symbol.attr_dict() - - for sym_name in self.symbol.list_inputs(): - if sym_name in input_types and sym_name in attr_dict \ - and "__dtype__" in attr_dict[sym_name] and attr_dict[sym_name]["__dtype__"] != "-1": - input_types[sym_name] = _DTYPE_MX_TO_NP[int(attr_dict[sym_name]["__dtype__"])] - - if label_shapes is not None: - input_types.update({x.name: x.dtype for x in label_shapes}) - - group2ctx = self.group2ctxs[i] - - executor = self.symbol.simple_bind(ctx=context, grad_req=self.grad_req, - type_dict=input_types, shared_arg_names=self.param_names, - shared_exec=shared_exec, group2ctx=group2ctx, - shared_buffer=shared_data_arrays, **input_shapes) - self._total_exec_bytes += int(executor.debug_str().split('\n')[-3].split()[1]) - return executor - - def _sliced_shape(self, shapes, i, major_axis): - """Get the sliced shapes for the i-th executor. - - Parameters - ---------- - shapes : list of (str, tuple) - The original (name, shape) pairs. - i : int - Which executor we are dealing with. - """ - sliced_shapes = [] - for desc, axis in zip(shapes, major_axis): - shape = list(desc.shape) - if axis >= 0: - shape[axis] = self.slices[i].stop - self.slices[i].start - sliced_shapes.append(DataDesc(desc.name, tuple(shape), desc.dtype, desc.layout)) - return sliced_shapes - - def install_monitor(self, mon): - """Install monitor on all executors""" - for exe in self.execs: - mon.install(exe) diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py deleted file mode 100644 index 3ba141e94f62..000000000000 --- a/python/mxnet/module/module.py +++ /dev/null @@ -1,870 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=too-many-instance-attributes, too-many-arguments, protected-access, too-many-branches -# pylint: disable=too-many-public-methods -"""A `Module` implement the `BaseModule` API by wrapping a `Symbol` and one or -more `Executor` for data parallelization. -""" - -import logging -import warnings - -from .. import context as ctx -from .. import optimizer as opt -from .. 
import ndarray as nd - -from .executor_group import DataParallelExecutorGroup -from ..model import _create_kvstore, _initialize_kvstore, _update_params, _update_params_on_kvstore -from ..model import load_checkpoint -from ..initializer import Uniform, InitDesc -from ..io import DataDesc -from ..ndarray import zeros - -from .base_module import BaseModule, _check_input_names, _parse_data_desc - -class Module(BaseModule): - """Module is a basic module that wrap a `Symbol`. It is functionally the same - as the `FeedForward` model, except under the module API. - - Parameters - ---------- - symbol : Symbol - data_names : list of str - Defaults to `('data')` for a typical model used in image classification. - label_names : list of str - Defaults to `('softmax_label')` for a typical model used in image - classification. - logger : Logger - Defaults to `logging`. - context : Context or list of Context - Defaults to ``mx.cpu()``. - work_load_list : list of number - Default ``None``, indicating uniform workload. - fixed_param_names: list of str - Default ``None``, indicating no network parameters are fixed. - state_names : list of str - states are similar to data and label, but not provided by data iterator. - Instead they are initialized to 0 and can be set by `set_states()`. - group2ctxs : dict of str to context or list of context, - or list of dict of str to context - Default is `None`. Mapping the `ctx_group` attribute to the context assignment. - compression_params : dict - Specifies type of gradient compression and additional arguments depending - on the type of compression being used. For example, 2bit compression requires a threshold. - Arguments would then be {'type':'2bit', 'threshold':0.5} - See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. 
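Taken together, the methods documented in this class (bind, init_params, init_optimizer, forward/backward/update and save_checkpoint) formed the training loop that this patch removes. A rough sketch of that now-removed workflow, where net and train_iter stand in for a user-defined symbol and data iterator:

    >>> mod = mx.mod.Module(symbol=net, data_names=('data',),
    ...                     label_names=('softmax_label',), context=mx.cpu())
    >>> mod.bind(data_shapes=train_iter.provide_data,
    ...          label_shapes=train_iter.provide_label)
    >>> mod.init_params(initializer=mx.init.Uniform(0.01))
    >>> mod.init_optimizer(optimizer='sgd', optimizer_params=(('learning_rate', 0.01),))
    >>> for batch in train_iter:
    ...     mod.forward(batch, is_train=True)
    ...     mod.backward()
    ...     mod.update()
    >>> mod.save_checkpoint('my_model', epoch=1)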
- """ - def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',), - logger=logging, context=ctx.cpu(), work_load_list=None, - fixed_param_names=None, state_names=None, group2ctxs=None, - compression_params=None): - super(Module, self).__init__(logger=logger) - - if isinstance(context, ctx.Context): - context = [context] - self._context = context - if work_load_list is None: - work_load_list = [1] * len(self._context) - assert len(work_load_list) == len(self._context) - self._work_load_list = work_load_list - - self._group2ctxs = group2ctxs - - self._symbol = symbol - - data_names = list(data_names) if data_names is not None else [] - label_names = list(label_names) if label_names is not None else [] - state_names = list(state_names) if state_names is not None else [] - fixed_param_names = list(fixed_param_names) if fixed_param_names is not None else [] - - _check_input_names(symbol, data_names, "data", True) - _check_input_names(symbol, label_names, "label", False) - _check_input_names(symbol, state_names, "state", True) - _check_input_names(symbol, fixed_param_names, "fixed_param", True) - - arg_names = symbol.list_arguments() - input_names = data_names + label_names + state_names - self._param_names = [x for x in arg_names if x not in input_names] - self._fixed_param_names = fixed_param_names - self._aux_names = symbol.list_auxiliary_states() - self._data_names = data_names - self._label_names = label_names - self._state_names = state_names - self._output_names = symbol.list_outputs() - - self._arg_params = None - self._aux_params = None - self._params_dirty = False - - self._compression_params = compression_params - self._optimizer = None - self._kvstore = None - self._update_on_kvstore = None - self._updater = None - self._preload_opt_states = None - self._grad_req = None - - self._exec_group = None - self._data_shapes = None - self._label_shapes = None - - @staticmethod - def load(prefix, epoch, load_optimizer_states=False, **kwargs): - """Creates a model from previously saved checkpoint. - - Parameters - ---------- - prefix : str - path prefix of saved model files. You should have - "prefix-symbol.json", "prefix-xxxx.params", and - optionally "prefix-xxxx.states", where xxxx is the - epoch number. - epoch : int - epoch to load. - load_optimizer_states : bool - whether to load optimizer states. Checkpoint needs - to have been made with save_optimizer_states=True. - data_names : list of str - Default is `('data')` for a typical model used in image classification. - label_names : list of str - Default is `('softmax_label')` for a typical model used in image - classification. - logger : Logger - Default is `logging`. - context : Context or list of Context - Default is ``cpu()``. - work_load_list : list of number - Default ``None``, indicating uniform workload. - fixed_param_names: list of str - Default ``None``, indicating no network parameters are fixed. - """ - sym, args, auxs = load_checkpoint(prefix, epoch) - mod = Module(symbol=sym, **kwargs) - mod._arg_params = args - mod._aux_params = auxs - mod.params_initialized = True - if load_optimizer_states: - mod._preload_opt_states = '%s-%04d.states'%(prefix, epoch) - return mod - - def save_checkpoint(self, prefix, epoch, save_optimizer_states=False, remove_amp_cast=True): - """Saves current progress to checkpoint. - Use `mx.callback.module_checkpoint` as `epoch_end_callback` to save during training. - - Parameters - ---------- - prefix : str - The file prefix to checkpoint to. 
- epoch : int - The current epoch number. - save_optimizer_states : bool - Whether to save optimizer states to continue training. - """ - self._symbol.save('%s-symbol.json'%prefix, remove_amp_cast=remove_amp_cast) - param_name = '%s-%04d.params' % (prefix, epoch) - self.save_params(param_name) - logging.info('Saved checkpoint to \"%s\"', param_name) - if save_optimizer_states: - state_name = '%s-%04d.states' % (prefix, epoch) - self.save_optimizer_states(state_name) - logging.info('Saved optimizer state to \"%s\"', state_name) - - def _reset_bind(self): - """Internal function to reset binded state.""" - self.binded = False - self._exec_group = None - self._data_shapes = None - self._label_shapes = None - - @property - def data_names(self): - """A list of names for data required by this module.""" - return self._data_names - - @property - def label_names(self): - """A list of names for labels required by this module.""" - return self._label_names - - @property - def output_names(self): - """A list of names for the outputs of this module.""" - return self._output_names - - @property - def data_shapes(self): - """Gets data shapes. - - Returns - ------- - A list of `(name, shape)` pairs. - """ - assert self.binded - return self._data_shapes - - @property - def label_shapes(self): - """Gets label shapes. - - Returns - ------- - A list of `(name, shape)` pairs. - The return value could be ``None`` if - the module does not need labels, or if the module is not bound for - training (in this case, label information is not available). - """ - assert self.binded - return self._label_shapes - - @property - def output_shapes(self): - """Gets output shapes. - - Returns - ------- - A list of `(name, shape)` pairs. - """ - assert self.binded - return self._exec_group.get_output_shapes() - - def get_params(self): - """Gets current parameters. - - Returns - ------- - `(arg_params, aux_params)` - A pair of dictionaries each mapping parameter names to NDArray values. - """ - assert self.params_initialized - - if self._params_dirty: - self._sync_params_from_devices() - return (self._arg_params, self._aux_params) - - def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None, - allow_missing=False, force_init=False, allow_extra=False): - """Initializes the parameters and auxiliary states. - - Parameters - ---------- - initializer : Initializer - Called to initialize parameters if needed. - arg_params : dict - If not ``None``, should be a dictionary of existing arg_params. Initialization - will be copied from that. - aux_params : dict - If not ``None``, should be a dictionary of existing aux_params. Initialization - will be copied from that. - allow_missing : bool - If ``True``, params could contain missing values, and the initializer will be - called to fill those missing params. - force_init : bool - If ``True``, will force re-initialize even if already initialized. - allow_extra : boolean, optional - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. - """ - if self.params_initialized and not force_init: - warnings.warn("Parameters already initialized and force_init=False. 
" - "init_params call ignored.", stacklevel=2) - return - assert self.binded, 'call bind before initializing the parameters' - - def _impl(name, arr, cache): - """Internal helper for parameter initialization""" - if cache is not None: - if name in cache: - cache_arr = cache[name] - - # just in case the cached array is just the target itself - if cache_arr is not arr: - cache_arr.copyto(arr) - else: - if not allow_missing: - raise RuntimeError("%s is not presented" % name) - if initializer is not None: - initializer(name, arr) - else: - initializer(name, arr) - - attrs = self._symbol.attr_dict() - for name, arr in sorted(self._arg_params.items()): - desc = InitDesc(name, attrs.get(name, None)) - _impl(desc, arr, arg_params) - - for name, arr in sorted(self._aux_params.items()): - desc = InitDesc(name, attrs.get(name, None)) - _impl(desc, arr, aux_params) - - self.params_initialized = True - self._params_dirty = False - - # copy the initialized parameters to devices - self._exec_group.set_params(self._arg_params, self._aux_params, - allow_extra=allow_extra) - - def set_params(self, arg_params, aux_params, allow_missing=False, force_init=True, - allow_extra=False): - """Assigns parameter and aux state values. - - Parameters - ---------- - arg_params : dict - Dictionary of name to `NDArray`. - aux_params : dict - Dictionary of name to `NDArray`. - allow_missing : bool - If ``True``, params could contain missing values, and the initializer will be - called to fill those missing params. - force_init : bool - If ``True``, will force re-initialize even if already initialized. - allow_extra : boolean, optional - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. - Examples - -------- - >>> # An example of setting module parameters. - >>> sym, arg_params, aux_params = mx.model.load_checkpoint(model_prefix, n_epoch_load) - >>> mod.set_params(arg_params=arg_params, aux_params=aux_params) - """ - if not allow_missing: - self.init_params(initializer=None, arg_params=arg_params, aux_params=aux_params, - allow_missing=allow_missing, force_init=force_init, - allow_extra=allow_extra) - return - - if self.params_initialized and not force_init: - warnings.warn("Parameters already initialized and force_init=False. " - "set_params call ignored.", stacklevel=2) - return - - self._exec_group.set_params(arg_params, aux_params, allow_extra=allow_extra) - - # because we didn't update self._arg_params, they are dirty now. - self._params_dirty = True - self.params_initialized = True - - def bind(self, data_shapes, label_shapes=None, for_training=True, - inputs_need_grad=False, force_rebind=False, shared_module=None, - grad_req='write'): - """Binds the symbols to construct executors. This is necessary before one - can perform computation with the module. - - Parameters - ---------- - data_shapes : list of (str, tuple) - Typically is ``data_iter.provide_data``. - label_shapes : list of (str, tuple) - Typically is ``data_iter.provide_label``. - for_training : bool - Default is ``True``. Whether the executors should be bound for training. - inputs_need_grad : bool - Default is ``False``. Whether the gradients to the input data need to be computed. - Typically this is not needed. But this might be needed when implementing composition - of modules. - force_rebind : bool - Default is ``False``. This function does nothing if the executors are already - bound. 
But with this ``True``, the executors will be forced to rebind. - shared_module : Module - Default is ``None``. This is used in bucketing. When not ``None``, the shared module - essentially corresponds to a different bucket -- a module with different symbol - but with the same sets of parameters (e.g. unrolled RNNs with different lengths). - """ - # force rebinding is typically used when one want to switch from - # training to prediction phase. - if force_rebind: - self._reset_bind() - - if self.binded: - self.logger.warning('Already bound, ignoring bind()') - return - - self.for_training = for_training - self.inputs_need_grad = inputs_need_grad - self._grad_req = grad_req - - if not for_training: - assert not inputs_need_grad - else: - pass - # this is not True, as some module might not contains a loss function - # that consumes the labels - # assert label_shapes is not None - - self._data_shapes, self._label_shapes = _parse_data_desc( - self.data_names, self.label_names, data_shapes, label_shapes) - - if shared_module is not None: - assert isinstance(shared_module, Module) and \ - shared_module.binded and shared_module.params_initialized - shared_group = shared_module._exec_group - assert len(shared_group.execs) >= len(self._context) - else: - shared_group = None - - self._exec_group = DataParallelExecutorGroup(self._symbol, self._context, - self._work_load_list, self._data_shapes, - self._label_shapes, self._param_names, - for_training, inputs_need_grad, - shared_group, logger=self.logger, - fixed_param_names=self._fixed_param_names, - grad_req=grad_req, group2ctxs=self._group2ctxs, - state_names=self._state_names) - self._total_exec_bytes = self._exec_group._total_exec_bytes - if shared_module is not None: - self.params_initialized = True - self._arg_params = shared_module._arg_params - self._aux_params = shared_module._aux_params - elif self.params_initialized: - # if the parameters are already initialized, we are re-binding - # so automatically copy the already initialized params - self._exec_group.set_params(self._arg_params, self._aux_params) - else: - assert self._arg_params is None and self._aux_params is None - param_arrays = [ - zeros(shape=x[0].shape, dtype=x[0].dtype, stype=x[0].stype) - for x in self._exec_group.param_arrays - ] - self._arg_params = {name:arr for name, arr in zip(self._param_names, param_arrays)} - - aux_arrays = [ - zeros(x[0].shape, dtype=x[0].dtype) - for x in self._exec_group.aux_arrays - ] - self._aux_params = {name:arr for name, arr in zip(self._aux_names, aux_arrays)} - - if shared_module is not None and shared_module.optimizer_initialized: - self.borrow_optimizer(shared_module) - - self.binded = True - - def reshape(self, data_shapes, label_shapes=None): - """Reshapes the module for new input shapes. - - Parameters - ---------- - data_shapes : list of (str, tuple) - Typically is ``data_iter.provide_data``. - label_shapes : list of (str, tuple) - Typically is ``data_iter.provide_label``. - """ - assert self.binded - self._data_shapes, self._label_shapes = _parse_data_desc( - self.data_names, self.label_names, data_shapes, label_shapes) - - self._exec_group.reshape(self._data_shapes, self._label_shapes) - - def init_optimizer(self, kvstore='local', optimizer='sgd', - optimizer_params=(('learning_rate', 0.01),), force_init=False): - """Installs and initializes optimizers. - - Parameters - ---------- - kvstore : str or KVStore - Default `'local'`. 
- optimizer : str or Optimizer - Default `'sgd'` - optimizer_params : dict - Default `(('learning_rate', 0.01),)`. The default value is not a dictionary, - just to avoid pylint warning of dangerous default values. - force_init : bool - Default ``False``, indicating whether we should force re-initializing the - optimizer in the case an optimizer is already installed. - """ - assert self.binded and self.params_initialized - - if self.optimizer_initialized and not force_init: - self.logger.warning('optimizer already initialized, ignoring...') - return - - if self._params_dirty: - self._sync_params_from_devices() - - (kvstore, update_on_kvstore) = \ - _create_kvstore(kvstore, len(self._context), self._arg_params) - - batch_size = self._exec_group.batch_size - if kvstore and 'dist' in kvstore.type and '_sync' in kvstore.type: - batch_size *= kvstore.num_workers - rescale_grad = 1.0/batch_size - - idx2name = {} - if update_on_kvstore: - idx2name.update(enumerate(self._exec_group.param_names)) - else: - for k in range(len(self._context)): - idx2name.update({i*len(self._context)+k: n - for i, n in enumerate(self._exec_group.param_names)}) - if isinstance(optimizer, str): - optimizer_params = dict(optimizer_params) - if 'rescale_grad' not in optimizer_params: - optimizer_params['rescale_grad'] = rescale_grad - optimizer = opt.create(optimizer, - sym=self.symbol, param_idx2name=idx2name, - **optimizer_params) - else: - assert isinstance(optimizer, opt.Optimizer) - if optimizer.rescale_grad != rescale_grad: - #pylint: disable=no-member - warnings.warn( - "Optimizer created manually outside Module but rescale_grad " + - "is not normalized to 1.0/batch_size/num_workers (%s vs. %s). "%( - optimizer.rescale_grad, rescale_grad) + - "Is this intended?", stacklevel=2) - if not optimizer.idx2name: - optimizer.idx2name = idx2name.copy() - - self._optimizer = optimizer - self._kvstore = kvstore - self._update_on_kvstore = update_on_kvstore - self._updater = None - - if kvstore: - if self._compression_params: - kvstore.set_gradient_compression(self._compression_params) - if update_on_kvstore: - kvstore.set_optimizer(self._optimizer) - # copy initialized local parameters to kvstore - _initialize_kvstore(kvstore=kvstore, - param_arrays=self._exec_group.param_arrays, - arg_params=self._arg_params, - param_names=self._param_names, - update_on_kvstore=update_on_kvstore) - - if not update_on_kvstore: - self._updater = opt.get_updater(optimizer) - - self.optimizer_initialized = True - - if self._preload_opt_states is not None: - self.load_optimizer_states(self._preload_opt_states) - self._preload_opt_states = None - - def borrow_optimizer(self, shared_module): - """Borrows optimizer from a shared module. Used in bucketing, where exactly the same - optimizer (esp. kvstore) is used. - - Parameters - ---------- - shared_module : Module - """ - assert shared_module.optimizer_initialized - self._optimizer = shared_module._optimizer - self._kvstore = shared_module._kvstore - self._update_on_kvstore = shared_module._update_on_kvstore - self._updater = shared_module._updater - self.optimizer_initialized = True - - def forward(self, data_batch, is_train=None): - """Forward computation. It supports data batches with different shapes, such as - different batch sizes or different image sizes. - If reshaping of data batch relates to modification of symbol or module, such as - changing image layout ordering or switching from training to predicting, module - rebinding is required. 
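As a small worked example of the gradient rescaling computed in init_optimizer above (the numbers are illustrative): with an executor-group batch size of 128 and a dist-sync kvstore reporting 4 workers, the optimizer's rescale_grad defaults to 1/(128*4):

    batch_size = 128            # self._exec_group.batch_size
    num_workers = 4             # kvstore.num_workers, applied only for 'dist' + '_sync' kvstores
    batch_size *= num_workers
    rescale_grad = 1.0 / batch_size
    print(rescale_grad)         # 0.001953125, used unless the optimizer already sets its own value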
- - See Also - ---------- - :meth:`BaseModule.forward`. - - Parameters - ---------- - data_batch : DataBatch - Could be anything with similar API implemented. - is_train : bool - Default is ``None``, which means ``is_train`` takes the value of ``self.for_training``. - """ - assert self.binded and self.params_initialized - - curr_data_shapes = tuple(i.shape for i in self._data_shapes) - if isinstance(data_batch, list): - assert data_batch is not None, "Encountered empty data batch" - new_data_shapes = [] - for i in range(len(data_batch[0].data)): - shape = data_batch[0].data[i].shape - for db in data_batch: - assert shape == db.data[i].shape, \ - "All data batches in a list need to have the same shape" - new_batch_size = len(data_batch) * shape[0] - new_data_shapes.append((new_batch_size,) + shape[1:]) - new_data_shapes = tuple(new_data_shapes) - else: - new_data_shapes = tuple(i.shape for i in data_batch.data) - - if curr_data_shapes != new_data_shapes: - if hasattr(data_batch, "provide_data") and data_batch.provide_data: - new_dshape = data_batch.provide_data - else: - new_dshape = [DataDesc(i.name, shape, i.dtype, i.layout) \ - for i, shape in zip(self._data_shapes, new_data_shapes)] - - if hasattr(data_batch, "provide_label") and data_batch.provide_label: - new_lshape = data_batch.provide_label - elif hasattr(data_batch, "label") and data_batch.label: - new_lshape = [DataDesc(i.name, j.shape, i.dtype, i.layout) \ - for i, j in zip(self._label_shapes, data_batch.label)] - else: - new_lshape = None - - self.reshape(new_dshape, new_lshape) - - self._exec_group.forward(data_batch, is_train) - - def backward(self, out_grads=None): - """Backward computation. - - See Also - ---------- - :meth:`BaseModule.backward`. - - Parameters - ---------- - out_grads : NDArray or list of NDArray, optional - Gradient on the outputs to be propagated back. - This parameter is only needed when bind is called - on outputs that are not a loss function. - """ - assert self.binded and self.params_initialized - self._exec_group.backward(out_grads=out_grads) - - def update(self): - """Updates parameters according to the installed optimizer and the gradients computed - in the previous forward-backward batch. - - When KVStore is used to update parameters for multi-device or multi-machine training, - a copy of the parameters are stored in KVStore. Note that for `row_sparse` parameters, - this function does update the copy of parameters in KVStore, but doesn't broadcast the - updated parameters to all devices / machines. Please call `prepare` to broadcast - `row_sparse` parameters with the next batch of data. - - See Also - ---------- - :meth:`BaseModule.update`. - """ - assert self.binded and self.params_initialized and self.optimizer_initialized - - self._params_dirty = True - if self._update_on_kvstore: - _update_params_on_kvstore(self._exec_group.param_arrays, - self._exec_group.grad_arrays, - self._kvstore, self._exec_group.param_names) - else: - _update_params(self._exec_group.param_arrays, - self._exec_group.grad_arrays, - updater=self._updater, - num_device=len(self._context), - kvstore=self._kvstore, - param_names=self._exec_group.param_names) - - def get_outputs(self, merge_multi_context=True): - """Gets outputs of the previous forward computation. - - If ``merge_multi_context`` is ``True``, it is like ``[out1, out2]``. Otherwise, it - is like ``[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]``. All the output - elements are `NDArray`. 
When `merge_multi_context` is `False`, those `NDArray` - might live on different devices. - - Parameters - ---------- - merge_multi_context : bool - Default is ``True``. In the case when data-parallelism is used, the outputs - will be collected from multiple devices. A ``True`` value indicate that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - list of NDArray or list of list of NDArray - Output. - """ - assert self.binded and self.params_initialized - return self._exec_group.get_outputs(merge_multi_context=merge_multi_context) - - def get_input_grads(self, merge_multi_context=True): - """Gets the gradients with respect to the inputs of the module. - - If ``merge_multi_context`` is ``True``, it is like ``[grad1, grad2]``. Otherwise, it - is like ``[[grad1_dev1, grad1_dev2], [grad2_dev1, grad2_dev2]]``. All the output - elements are `NDArray`. - - Parameters - ---------- - merge_multi_context : bool - Default is ``True``. In the case when data-parallelism is used, the outputs - will be collected from multiple devices. A ``True`` value indicate that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - list of NDArray or list of list of NDArray - Input gradients - """ - assert self.binded and self.params_initialized and self.inputs_need_grad - return self._exec_group.get_input_grads(merge_multi_context=merge_multi_context) - - def get_states(self, merge_multi_context=True): - """Gets states from all devices. - - If `merge_multi_context` is ``True``, it is like ``[out1, out2]``. Otherwise, it - is like ``[[out1_dev1, out1_dev2], [out2_dev1, out2_dev2]]``. All the output - elements are `NDArray`. - - Parameters - ---------- - merge_multi_context : bool - Default is ``True``. In the case when data-parallelism is used, the states - will be collected from multiple devices. A ``True`` value indicate that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - list of NDArray or list of list of NDArray - States - """ - assert self.binded and self.params_initialized - return self._exec_group.get_states(merge_multi_context=merge_multi_context) - - def set_states(self, states=None, value=None): - """Sets value for states. Only one of the states & value can be specified. - - Parameters - ---------- - states : list of list of NDArrays - source states arrays formatted like ``[[state1_dev1, state1_dev2], - [state2_dev1, state2_dev2]]``. - value : number - a single scalar value for all state arrays. - """ - assert self.binded and self.params_initialized - self._exec_group.set_states(states, value) - - def update_metric(self, eval_metric, labels, pre_sliced=False): - """Evaluates and accumulates evaluation metric on outputs of the last forward computation. - - See Also - ---------- - :meth:`BaseModule.update_metric`. - - Parameters - ---------- - eval_metric : EvalMetric - Evaluation metric to use. - labels : list of NDArray if `pre_sliced` parameter is set to `False`, - list of lists of NDArray otherwise. Typically `data_batch.label`. - pre_sliced: bool - Whether the labels are already sliced per device (default: False). - """ - self._exec_group.update_metric(eval_metric, labels, pre_sliced) - - def _sync_params_from_devices(self): - """Synchronizes parameters from devices to CPU. 
This function should be called after - calling `update` that updates the parameters on the devices, before one can read the - latest parameters from ``self._arg_params`` and ``self._aux_params``. - - For row_sparse parameters on devices, ther are pulled from KVStore with all row ids. - - """ - self._exec_group.get_params(self._arg_params, self._aux_params) - if self._kvstore and self._update_on_kvstore: - for param_name, param_val in sorted(self._arg_params.items()): - if param_val.stype == 'row_sparse': - row_ids = nd.arange(0, param_val.shape[0], dtype='int64') - self._kvstore.row_sparse_pull(param_name, param_val, row_ids=row_ids) - self._params_dirty = False - - def save_optimizer_states(self, fname): - """Saves optimizer (updater) state to a file. - - Parameters - ---------- - fname : str - Path to output states file. - """ - assert self.optimizer_initialized - - if self._update_on_kvstore: - self._kvstore.save_optimizer_states(fname) - else: - with open(fname, 'wb') as fout: - fout.write(self._updater.get_states()) - - def load_optimizer_states(self, fname): - """Loads optimizer (updater) state from a file. - - Parameters - ---------- - fname : str - Path to input states file. - """ - assert self.optimizer_initialized - - if self._update_on_kvstore: - self._kvstore.load_optimizer_states(fname) - else: - self._updater.set_states(open(fname, 'rb').read()) - - def install_monitor(self, mon): - """Installs monitor on all executors. """ - assert self.binded - self._exec_group.install_monitor(mon) - - def prepare(self, data_batch, sparse_row_id_fn=None): - '''Prepares the module for processing a data batch. - - Usually involves switching bucket and reshaping. - For modules that contain `row_sparse` parameters in KVStore, - it prepares the `row_sparse` parameters based on the sparse_row_id_fn. - - When KVStore is used to update parameters for multi-device or multi-machine training, - a copy of the parameters are stored in KVStore. Note that for `row_sparse` parameters, - the `update()` updates the copy of parameters in KVStore, but doesn't broadcast - the updated parameters to all devices / machines. The `prepare` function is used to - broadcast `row_sparse` parameters with the next batch of data. - - Parameters - ---------- - data_batch : DataBatch - The current batch of data for forward computation. - - sparse_row_id_fn : A callback function - The function takes `data_batch` as an input and returns a dict of - str -> NDArray. The resulting dict is used for pulling row_sparse - parameters from the kvstore, where the str key is the name of the param, - and the value is the row id of the param to pull. - ''' - assert self.binded - if sparse_row_id_fn is not None: - if not self._kvstore or not self._update_on_kvstore: - warnings.warn(UserWarning("Parameters are not updated in the KVStore. " - "No need to call sparse_row_id_fn.")) - else: - row_ids = sparse_row_id_fn(data_batch) - assert(isinstance(row_ids, dict)), "Expected dict output from sparse_row_id_fn" - for param_name, row_id in row_ids.items(): - param_idx = self._exec_group.param_names.index(param_name) - param_val = self._exec_group.param_arrays[param_idx] - assert(isinstance(param_val, (tuple, list))) - if param_val[0].stype != 'row_sparse': - warnings.warn(UserWarning("%s.stype is not 'row_sparse'. No need to " - "perform row_sparse_pull." 
% param_name)) - else: - self._kvstore.row_sparse_pull(param_name, param_val, row_ids=row_id, - priority=-param_idx) diff --git a/python/mxnet/module/python_module.py b/python/mxnet/module/python_module.py deleted file mode 100644 index a5d6f157e6a5..000000000000 --- a/python/mxnet/module/python_module.py +++ /dev/null @@ -1,362 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=too-many-instance-attributes, too-many-arguments, unnecessary-pass -"""Provide some handy classes for user to implement a simple computation module -in Python easily. -""" -import logging - -from .base_module import BaseModule -from ..initializer import Uniform -from .. import ndarray as nd - -class PythonModule(BaseModule): - """A convenient module class that implements many of the module APIs as - empty functions. - - Parameters - ---------- - data_names : list of str - Names of the data expected by the module. - label_names : list of str - Names of the labels expected by the module. Could be ``None`` if the - module does not need labels. - output_names : list of str - Names of the outputs. - """ - def __init__(self, data_names, label_names, output_names, logger=logging): - super(PythonModule, self).__init__(logger=logger) - - if isinstance(data_names, tuple): - data_names = list(data_names) - if isinstance(label_names, tuple): - label_names = list(label_names) - - self._data_names = data_names - self._label_names = label_names - self._output_names = output_names - - self._data_shapes = None - self._label_shapes = None - self._output_shapes = None - - ################################################################################ - # Symbol information - ################################################################################ - @property - def data_names(self): - """A list of names for data required by this module.""" - return self._data_names - - @property - def output_names(self): - """A list of names for the outputs of this module.""" - return self._output_names - - ################################################################################ - # Input/Output information - ################################################################################ - @property - def data_shapes(self): - """A list of (name, shape) pairs specifying the data inputs to this module.""" - return self._data_shapes - - @property - def label_shapes(self): - """A list of (name, shape) pairs specifying the label inputs to this module. - If this module does not accept labels -- either it is a module without loss - function, or it is not bound for training, then this should return an empty - list ``[]```. 
- """ - return self._label_shapes - - @property - def output_shapes(self): - """A list of (name, shape) pairs specifying the outputs of this module.""" - return self._output_shapes - - ################################################################################ - # Parameters of a module - ################################################################################ - def get_params(self): - """Gets parameters, those are potentially copies of the actual parameters used - to do computation on the device. Subclass should override this method if contains - parameters. - - Returns - ------- - ``({}, {})``, a pair of empty dict. - """ - return (dict(), dict()) - - def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None, - allow_missing=False, force_init=False, allow_extra=False): - """Initializes the parameters and auxiliary states. By default this function - does nothing. Subclass should override this method if contains parameters. - - Parameters - ---------- - initializer : Initializer - Called to initialize parameters if needed. - arg_params : dict - If not ``None``, should be a dictionary of existing `arg_params`. Initialization - will be copied from that. - aux_params : dict - If not ``None``, should be a dictionary of existing `aux_params`. Initialization - will be copied from that. - allow_missing : bool - If ``True``, params could contain missing values, and the initializer will be - called to fill those missing params. - force_init : bool - If ``True``, will force re-initialize even if already initialized. - allow_extra : boolean, optional - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. - """ - pass - - def update(self): - """Updates parameters according to the installed optimizer and the gradients computed - in the previous forward-backward batch. Currently we do nothing here. Subclass should - override this method if contains parameters. - """ - pass - - def update_metric(self, eval_metric, labels, pre_sliced=False): - """Evaluates and accumulates evaluation metric on outputs of the last forward computation. - Subclass should override this method if needed. - - Parameters - ---------- - eval_metric : EvalMetric - labels : list of NDArray - Typically ``data_batch.label``. - """ - if self._label_shapes is None: - # since we do not need labels, we are probably not a module with a loss - # function or predictions, so just ignore this call - return - - if pre_sliced: - raise RuntimeError("PythonModule does not support presliced labels") - - # by default we expect our outputs are some scores that could be evaluated - eval_metric.update(labels, self.get_outputs()) - - ################################################################################ - # module setup - ################################################################################ - def bind(self, data_shapes, label_shapes=None, for_training=True, - inputs_need_grad=False, force_rebind=False, shared_module=None, - grad_req='write'): - """Binds the symbols to construct executors. This is necessary before one - can perform computation with the module. - - Parameters - ---------- - data_shapes : list of (str, tuple) - Typically is ``data_iter.provide_data``. - label_shapes : list of (str, tuple) - Typically is ``data_iter.provide_label``. - for_training : bool - Default is ``True``. Whether the executors should be bind for training. 
- inputs_need_grad : bool - Default is ``False``. Whether the gradients to the input data need to be computed. - Typically this is not needed. But this might be needed when implementing composition - of modules. - force_rebind : bool - Default is ``False``. This function does nothing if the executors are already - bound. But with this ``True``, the executors will be forced to rebind. - shared_module : Module - Default is ``None``. This is used in bucketing. When not ``None``, the shared module - essentially corresponds to a different bucket -- a module with different symbol - but with the same sets of parameters (e.g. unrolled RNNs with different lengths). - grad_req : str, list of str, dict of str to str - Requirement for gradient accumulation. Can be 'write', 'add', or 'null' - (default to 'write'). - Can be specified globally (str) or for each argument (list, dict). - """ - if self.binded and not force_rebind: - self.logger.warning('Already bound, ignoring bind()') - return - - assert grad_req == 'write', "Python module only support write gradient" - self.for_training = for_training - self.inputs_need_grad = inputs_need_grad - - assert len(data_shapes) == len(self._data_names) - assert [x[0] for x in data_shapes] == self._data_names - self._data_shapes = data_shapes - - self._label_shapes = label_shapes - if label_shapes is not None: - assert self._label_names is not None - assert len(self._label_names) == len(label_shapes) - assert [x[0] for x in label_shapes] == self._label_names - - self._output_shapes = self._compute_output_shapes() - - def _compute_output_shapes(self): - """The subclass should implement this method to compute the shape of - outputs. This method can assume that the ``data_shapes`` and ``label_shapes`` - are already initialized. - """ - raise NotImplementedError() - - def init_optimizer(self, kvstore='local', optimizer='sgd', - optimizer_params=(('learning_rate', 0.01),), force_init=False): - """Installs and initializes optimizers. By default we do nothing. Subclass should - override this method if needed. - - Parameters - ---------- - kvstore : str or KVStore - Default `'local'`. - optimizer : str or Optimizer - Default `'sgd'` - optimizer_params : dict - Default `(('learning_rate', 0.01),)`. The default value is not a dictionary, - just to avoid pylint warning of dangerous default values. - force_init : bool - Default `False`, indicating whether we should force re-initializing the - optimizer in the case an optimizer is already installed. - """ - pass - - -class PythonLossModule(PythonModule): - """A convenient module class that implements many of the module APIs as - empty functions. - - Parameters - ---------- - name : str - Names of the module. The outputs will be named `[name + '_output']`. - data_names : list of str - Defaults to ``['data']``. Names of the data expected by this module. - Should be a list of only one name. - label_names : list of str - Default ``['softmax_label']``. Names of the labels expected by the module. - Should be a list of only one name. - grad_func : function - Optional. If not ``None``, should be a function that takes `scores` - and `labels`, both of type `NDArray`, and return the gradients with - respect to the scores according to this loss function. The return - value could be a numpy array or an `NDArray`. 
- """ - def __init__(self, name='pyloss', data_names=('data',), label_names=('softmax_label',), - logger=logging, grad_func=None): - super(PythonLossModule, self).__init__(data_names, label_names, - [name + '_output'], logger=logger) - self._name = name - assert len(data_names) == 1 - assert len(label_names) == 1 - - self._scores = None - self._labels = None - self._scores_grad = None - - if grad_func is not None: - assert callable(grad_func) - self._grad_func = grad_func - - def _compute_output_shapes(self): - """Computes the shapes of outputs. As a loss module with outputs, we simply - output whatever we receive as inputs (i.e. the scores). - """ - return [(self._name + '_output', self._data_shapes[0][1])] - - def forward(self, data_batch, is_train=None): - """Forward computation. Here we do nothing but to keep a reference to - the scores and the labels so that we can do backward computation. - - Parameters - ---------- - data_batch : DataBatch - Could be anything with similar API implemented. - is_train : bool - Default is ``None``, which means `is_train` takes the value of ``self.for_training``. - """ - self._scores = data_batch.data[0] - - if is_train is None: - is_train = self.for_training - - if is_train: - self._labels = data_batch.label[0] - - def get_outputs(self, merge_multi_context=True): - """Gets outputs of the previous forward computation. As a output loss module, - we treat the inputs to this module as scores, and simply return them. - - Parameters - ---------- - merge_multi_context : bool - Should always be ``True``, because we do not use multiple contexts for computing. - """ - assert merge_multi_context is True - return [self._scores] - - def backward(self, out_grads=None): - """Backward computation. - - Parameters - ---------- - out_grads : NDArray or list of NDArray, optional - Gradient on the outputs to be propagated back. - This parameter is only needed when bind is called - on outputs that are not a loss function. - """ - assert out_grads is None, 'For a loss module, out_grads should be None' - assert self.for_training - - self._backward_impl() - - def _backward_impl(self): - """Actual implementation of the backward computation. The computation - should take ``self._scores`` and ``self._labels`` and then compute the - gradients with respect to the scores, store it as an `NDArray` in - ``self._scores_grad``. - - Instead of defining a subclass and overriding this function, - a more convenient way is to pass in a `grad_func` when constructing - the module object. Then it will be called to compute the gradients. - """ - if self._grad_func is not None: - grad = self._grad_func(self._scores, self._labels) - if not isinstance(grad, nd.NDArray): - grad = nd.array(grad) - self._scores_grad = grad - else: - raise NotImplementedError() - - def get_input_grads(self, merge_multi_context=True): - """Gets the gradients to the inputs, computed in the previous backward computation. - - Parameters - ---------- - merge_multi_context : bool - Should always be ``True`` because we do not use multiple context for computation. 
- """ - assert merge_multi_context is True - return [self._scores_grad] - - def install_monitor(self, mon): - """Installs monitor on all executors.""" - raise NotImplementedError() diff --git a/python/mxnet/module/sequential_module.py b/python/mxnet/module/sequential_module.py deleted file mode 100644 index 507f3f730f2e..000000000000 --- a/python/mxnet/module/sequential_module.py +++ /dev/null @@ -1,440 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# pylint: disable=too-many-arguments, too-many-locals, too-many-instance-attributes -"""`SequentialModule` is a container module that chains a number of modules together.""" - -import logging -import copy - -from ..initializer import Uniform - -from .base_module import BaseModule - -class SequentialModule(BaseModule): - """A SequentialModule is a container module that can chain multiple modules together. - - .. note:: - - Building a computation graph with this kind of imperative container is less - flexible and less efficient than the symbolic graph. So, this should be only used as a - handy utility. - """ - - META_TAKE_LABELS = 'take_labels' - META_AUTO_WIRING = 'auto_wiring' - - def __init__(self, logger=logging): - super(SequentialModule, self).__init__(logger=logger) - self._modules = [] - self._metas = [] - - self._label_shapes = None - self._data_shapes = None - self._meta_keys = set([getattr(SequentialModule, x) - for x in dir(SequentialModule) - if x.startswith('META_')]) - - def add(self, module, **kwargs): - """Add a module to the chain. - - Parameters - ---------- - module : BaseModule - The new module to add. - kwargs : ``**keywords`` - All the keyword arguments are saved as meta information - for the added module. The currently known meta includes - - - `take_labels`: indicating whether the module expect to - take labels when doing computation. Note any module in - the chain can take labels (not necessarily only the top - most one), and they all take the same labels passed - from the original data batch for the `SequentialModule`. - - - Returns - ------- - self - This function returns `self` to allow us to easily chain a - series of `add` calls. - Examples - -------- - >>> # An example of addinging two modules to a chain. - >>> seq_mod = mx.mod.SequentialModule() - >>> seq_mod.add(mod1) - >>> seq_mod.add(mod2) - - """ - self._modules.append(module) - - # a sanity check to avoid typo - for key in kwargs: - assert key in self._meta_keys, ('Unknown meta "%s", a typo?' % key) - - self._metas.append(kwargs) - - # after adding new modules, we are reset back to raw states, needs - # to bind, init_params, etc. 
- self.binded = False - self.params_initialized = False - self.optimizer_initialized = False - - return self # for easier chaining - - @property - def data_names(self): - """A list of names for data required by this module.""" - if len(self._modules) > 0: - return self._modules[0].data_names - return [] - - @property - def output_names(self): - """A list of names for the outputs of this module.""" - if len(self._modules) > 0: - return self._modules[-1].output_names - return [] - - @property - def data_shapes(self): - """Gets data shapes. - - Returns - ------- - list - A list of `(name, shape)` pairs. The data shapes of the first module - is the data shape of a `SequentialModule`. - """ - assert self.binded - return self._modules[0].data_shapes - - @property - def label_shapes(self): - """Gets label shapes. - - Returns - ------- - list - A list of `(name, shape)` pairs. The return value could be `None` if - the module does not need labels, or if the module is not bound for - training (in this case, label information is not available). - """ - assert self.binded - return self._label_shapes - - @property - def output_shapes(self): - """Gets output shapes. - - Returns - ------- - list - A list of `(name, shape)` pairs. The output shapes of the last - module is the output shape of a `SequentialModule`. - """ - assert self.binded - return self._modules[-1].output_shapes - - def get_params(self): - """Gets current parameters. - - Returns - ------- - (arg_params, aux_params) - A pair of dictionaries each mapping parameter names to NDArray values. This - is a merged dictionary of all the parameters in the modules. - """ - assert self.binded and self.params_initialized - - arg_params = dict() - aux_params = dict() - - for module in self._modules: - arg, aux = module.get_params() - arg_params.update(arg) - aux_params.update(aux) - - return (arg_params, aux_params) - - def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None, - allow_missing=False, force_init=False, allow_extra=False): - """Initializes parameters. - - Parameters - ---------- - initializer : Initializer - arg_params : dict - Default ``None``. Existing parameters. This has higher priority - than `initializer`. - aux_params : dict - Default ``None``. Existing auxiliary states. This has higher priority - than `initializer`. - allow_missing : bool - Allow missing values in `arg_params` and `aux_params` (if not ``None``). - In this case, missing values will be filled with `initializer`. - force_init : bool - Default ``False``. - allow_extra : boolean, optional - Whether allow extra parameters that are not needed by symbol. - If this is True, no error will be thrown when arg_params or aux_params - contain extra parameters that is not needed by the executor. - """ - if self.params_initialized and not force_init: - return - assert self.binded, 'call bind before initializing the parameters' - - for module in self._modules: - module.init_params(initializer=initializer, arg_params=arg_params, - aux_params=aux_params, allow_missing=allow_missing, - force_init=force_init, allow_extra=allow_extra) - - # make sure we do not have duplicated parameter names - def _check_name(known_names, new_names, modules, i): - """Internal function to help checking duplicated names.""" - for name in new_names: - assert not name in known_names, "Duplicated parameter names: " + \ - ('name "%s" in layer %d (%s) is already ' % (name, i, type(modules[i]))) + \ - ('used in layer %d (%s).' 
% (known_names[name], - type(modules[known_names[name]]))) - known_names[name] = i - - arg_names = dict() - aux_names = dict() - for i_layer, module in enumerate(self._modules): - arg_params, aux_params = module.get_params() - _check_name(arg_names, arg_params.keys(), self._modules, i_layer) - _check_name(aux_names, aux_params.keys(), self._modules, i_layer) - - self.params_initialized = True - - def bind(self, data_shapes, label_shapes=None, for_training=True, - inputs_need_grad=False, force_rebind=False, shared_module=None, - grad_req='write'): - """Binds the symbols to construct executors. This is necessary before one - can perform computation with the module. - - Parameters - ---------- - data_shapes : list of (str, tuple) - Typically is `data_iter.provide_data`. - label_shapes : list of (str, tuple) - Typically is `data_iter.provide_label`. - for_training : bool - Default is ``True``. Whether the executors should be bind for training. - inputs_need_grad : bool - Default is ``False``. Whether the gradients to the input data need to be computed. - Typically this is not needed. But this might be needed when implementing composition - of modules. - force_rebind : bool - Default is ``False``. This function does nothing if the executors are already - bound. But with this ``True``, the executors will be forced to rebind. - shared_module : Module - Default is ``None``. Currently shared module is not supported for `SequentialModule`. - grad_req : str, list of str, dict of str to str - Requirement for gradient accumulation. Can be 'write', 'add', or 'null' - (default to 'write'). - Can be specified globally (str) or for each argument (list, dict). - """ - if self.binded and not force_rebind: - self.logger.warning('Already bound, ignoring bind()') - return - - if inputs_need_grad: - assert for_training is True - assert shared_module is None, 'Shared module is not supported' - assert len(self._modules) > 0, 'Attempting to bind an empty SequentialModule' - - self.binded = True - - # the same label shapes are used for all chained modules - self._label_shapes = label_shapes - - my_data_shapes = data_shapes - anybody_ever_needs_label = False - for i_layer, module in enumerate(self._modules): - meta = self._metas[i_layer] - if SequentialModule.META_TAKE_LABELS in meta and \ - meta[SequentialModule.META_TAKE_LABELS]: - my_label_shapes = label_shapes - anybody_ever_needs_label = True - else: - my_label_shapes = None - - my_inputs_need_grad = bool(inputs_need_grad or - (for_training and i_layer > 0)) - - if meta.get(SequentialModule.META_AUTO_WIRING, False): - data_names = module.data_names - assert len(data_names) == len(my_data_shapes) - my_data_shapes = [(new_name, shape) for (new_name, (_, shape)) - in zip(data_names, my_data_shapes)] - - module.bind(data_shapes=my_data_shapes, label_shapes=my_label_shapes, - for_training=for_training, inputs_need_grad=my_inputs_need_grad, - force_rebind=force_rebind, shared_module=None, grad_req=grad_req) - - # the output of the previous module is the data of the next module - my_data_shapes = module.output_shapes - - if not anybody_ever_needs_label: - # then I do not need label either - self._label_shapes = None - - def init_optimizer(self, kvstore='local', optimizer='sgd', - optimizer_params=(('learning_rate', 0.01),), - force_init=False): - """Installs and initializes optimizers. - - Parameters - ---------- - kvstore : str or KVStore - Default `'local'`. 
- optimizer : str or Optimizer - Default `'sgd'` - optimizer_params : dict - Default ``(('learning_rate', 0.01),)``. The default value is not a dictionary, - just to avoid pylint warning of dangerous default values. - force_init : bool - Default ``False``, indicating whether we should force re-initializing the - optimizer in the case an optimizer is already installed. - """ - assert self.binded and self.params_initialized - if self.optimizer_initialized and not force_init: - self.logger.warning('optimizer already initialized, ignoring.') - return - - for module in self._modules: - module.init_optimizer(kvstore=kvstore, optimizer=optimizer, - optimizer_params=optimizer_params, force_init=force_init) - - self.optimizer_initialized = True - - def forward(self, data_batch, is_train=None): - """Forward computation. - - Parameters - ---------- - data_batch : DataBatch - is_train : bool - Default is ``None``, in which case `is_train` is take as ``self.for_training``. - """ - assert self.binded and self.params_initialized - - # make a shallow copy, just to maintain necessary properties (if any) like - # bucket_key, pad, etc. - data_batch = copy.copy(data_batch) - - for i_layer, module in enumerate(self._modules): - module.forward(data_batch, is_train=is_train) - - if i_layer+1 == len(self._modules): - # the last layer, do not need to do the followings - break - - data_batch.data = module.get_outputs() - if hasattr(data_batch, 'provide_data'): - # need to update this, in case the internal module is using bucketing - # or whatever - data_names = [x[0] for x in module.output_shapes] - assert len(data_names) == len(data_batch.data) - data_batch.provide_data = [(name, x.shape) for name, x in - zip(data_names, data_batch.data)] - - def backward(self, out_grads=None): - """Backward computation.""" - assert self.binded and self.params_initialized - - for i_layer, module in reversed(list(zip(range(len(self._modules)), self._modules))): - module.backward(out_grads=out_grads) - if i_layer == 0: - break - - out_grads = module.get_input_grads() - - def update(self): - """Updates parameters according to installed optimizer and the gradient computed - in the previous forward-backward cycle. - """ - assert self.binded and self.params_initialized and self.optimizer_initialized - - for module in self._modules: - module.update() - - def get_outputs(self, merge_multi_context=True): - """Gets outputs from a previous forward computation. - - Parameters - ---------- - merge_multi_context : bool - Default is ``True``. In the case when data-parallelism is used, the outputs - will be collected from multiple devices. A ``True`` value indicate that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - list of NDArray or list of list of NDArray - If `merge_multi_context` is ``True``, it is like ``[out1, - out2]``. Otherwise, it is like ``[[out1_dev1, out1_dev2], [out2_dev1, - out2_dev2]]``. All the output elements are numpy arrays. - """ - assert self.binded and self.params_initialized - return self._modules[-1].get_outputs(merge_multi_context=merge_multi_context) - - def get_input_grads(self, merge_multi_context=True): - """Gets the gradients with respect to the inputs of the module. - - Parameters - ---------- - merge_multi_context : bool - Default is ``True``. In the case when data-parallelism is used, the outputs - will be collected from multiple devices. 
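Continuing the chain sketched above, binding and stepping a SequentialModule followed the usual Module protocol: each stage's output shapes become the next stage's data shapes, and only the stage added with take_labels=True sees the labels. A sketch with a toy iterator (assumed data, not from this patch):

    import numpy as np

    train_iter = mx.io.NDArrayIter(np.random.uniform(size=(100, 20)).astype('float32'),
                                   np.random.randint(0, 10, (100,)).astype('float32'),
                                   batch_size=10)

    seq.bind(data_shapes=train_iter.provide_data,
             label_shapes=train_iter.provide_label)
    seq.init_params(initializer=mx.init.Xavier())
    seq.init_optimizer(optimizer='sgd', optimizer_params=(('learning_rate', 0.1),))

    for batch in train_iter:
        seq.forward(batch, is_train=True)   # data flows mod1 -> mod2
        seq.backward()                      # gradients flow mod2 -> mod1
        seq.update()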
A ``True`` value indicate that we - should merge the collected results so that they look like from a single - executor. - - Returns - ------- - list of NDArrays or list of list of NDArrays - If `merge_multi_context` is ``True``, it is like ``[grad1, grad2]``. Otherwise, it - is like ``[[grad1_dev1, grad1_dev2], [grad2_dev1, grad2_dev2]]``. All the output - elements are `NDArray`. - """ - assert self.binded and self.params_initialized and self.inputs_need_grad - return self._modules[0].get_input_grads(merge_multi_context=merge_multi_context) - - def update_metric(self, eval_metric, labels, pre_sliced=False): - """Evaluates and accumulates evaluation metric on outputs of the last forward computation. - - Parameters - ---------- - eval_metric : EvalMetric - labels : list of NDArray - Typically ``data_batch.label``. - """ - assert self.binded and self.params_initialized - - for meta, module in zip(self._metas, self._modules): - if SequentialModule.META_TAKE_LABELS in meta and \ - meta[SequentialModule.META_TAKE_LABELS]: - module.update_metric(eval_metric, labels, pre_sliced) - - def install_monitor(self, mon): - """Installs monitor on all executors.""" - assert self.binded - for module in self._modules: - module.install_monitor(mon) diff --git a/tests/nightly/model_backwards_compatibility_check/common.py b/tests/nightly/model_backwards_compatibility_check/common.py index cbdc61948b88..1791b732df22 100644 --- a/tests/nightly/model_backwards_compatibility_check/common.py +++ b/tests/nightly/model_backwards_compatibility_check/common.py @@ -53,18 +53,6 @@ def get_model_path(model_name): return os.path.join(os.getcwd(), 'models', str(mxnet_version), model_name) -def get_module_api_model_definition(): - input = mx.symbol.Variable('data') - input = mx.symbol.Flatten(data=input) - - fc1 = mx.symbol.FullyConnected(data=input, name='fc1', num_hidden=128) - act1 = mx.sym.Activation(data=fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(data=act1, name='fc2', num_hidden=2) - op = mx.symbol.SoftmaxOutput(data=fc2, name='softmax') - model = mx.mod.Module(symbol=op, context=ctx, data_names=['data'], label_names=['softmax_label']) - return model - - def save_inference_results(inference_results, model_name): assert (isinstance(inference_results, mx.ndarray.ndarray.NDArray)) save_path = os.path.join(get_model_path(model_name), ''.join([model_name, '-inference'])) diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py index ee966dd291f1..3091edf5b2a6 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_inference.py @@ -19,38 +19,6 @@ from common import * - -def test_module_checkpoint_api(): - model_name = 'module_checkpoint_api' - print ('Performing inference for model/API %s' % model_name) - - # For each MXNet version that has the saved models - for folder in get_top_level_folders_in_bucket(s3, model_bucket_name): - logging.info('Fetching files for MXNet version : %s and model %s' % (folder, model_name)) - model_files = download_model_files_from_s3(model_name, folder) - if len(model_files) == 0: - logging.warn('No training files found for %s for MXNet version : %s' % (model_name, folder)) - continue - - data = mx.nd.load(''.join([model_name, '-data'])) - data_iter = mx.io.NDArrayIter(data['data'], data['labels'], batch_size=10) - # Load 
the model and perform inference - loaded_model = get_module_api_model_definition() - - sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, 1) - loaded_model.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label) - loaded_model.set_params(arg_params, aux_params) - - old_inference_results = load_inference_results(model_name) - inference_results = loaded_model.predict(data_iter) - # Check whether they are equal or not ? - assert_almost_equal(inference_results.asnumpy(), old_inference_results.asnumpy(), rtol=rtol_default, atol=atol_default) - clean_model_files(model_files, model_name) - logging.info('=================================') - - logging.info('Assertion passed for model : %s' % model_name) - - def test_lenet_gluon_load_params_api(): model_name = 'lenet_gluon_save_params_api' logging.info('Performing inference for model/API %s' % model_name) @@ -131,7 +99,6 @@ def test_lstm_gluon_load_parameters_api(): if __name__ == '__main__': - test_module_checkpoint_api() test_lenet_gluon_load_params_api() test_lenet_gluon_hybrid_imports_api() test_lstm_gluon_load_parameters_api() diff --git a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py index f454eb7d3393..573119a8f731 100644 --- a/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py +++ b/tests/nightly/model_backwards_compatibility_check/model_backwards_compat_train.py @@ -20,29 +20,6 @@ from common import * -def train_module_checkpoint_api(): - model_name = 'module_checkpoint_api' - create_model_folder(model_name) - logging.info('Saving files for model %s' % model_name) - # Prepare data - test_data = mx.nd.array(np.random.uniform(-1, 1, size=(20, 1))) - test_label = mx.nd.array(np.random.randint(0, 2, size=(20,)), dtype='float32') - data_iter = mx.io.NDArrayIter(test_data, test_label, batch_size=10) - - mod = get_module_api_model_definition() - mod.bind(data_shapes=data_iter.provide_data, label_shapes=data_iter.provide_label) - weights = mx.initializer.Xavier(magnitude=2.57) - mod.init_params(weights) - - mod.save_checkpoint(os.path.join(get_model_path(model_name), model_name), 1) - - inference_results = mod.predict(data_iter) - # Save inference_results - # Save the model files - save_data_and_labels(test_data, test_label, model_name) - save_inference_results(inference_results, model_name) - - def train_lenet_gluon_save_params_api(): model_name = 'lenet_gluon_save_params_api' create_model_folder(model_name) @@ -120,8 +97,6 @@ def create_root_folder(): if __name__ == '__main__': create_root_folder() - - train_module_checkpoint_api() train_lenet_gluon_save_params_api() train_lenet_gluon_hybrid_export_api() train_lstm_gluon_save_parameters_api() diff --git a/tests/nightly/test_optimizer.py b/tests/nightly/test_optimizer.py index 9c2fcb8a62cf..0cba4d78e539 100644 --- a/tests/nightly/test_optimizer.py +++ b/tests/nightly/test_optimizer.py @@ -50,41 +50,3 @@ def lenet5(): lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax') #pylint: enable=no-member return lenet - -@with_seed() -def test_lars(): - num_epochs = 10 - batch_size = 8000 - mnist = mx.test_utils.get_mnist() - train_iter = mx.io.NDArrayIter(mnist['train_data'], - mnist['train_label'], - batch_size, - shuffle=True) - test_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) - ctx = mx.gpu(0) - lenet_model = mx.mod.Module(lenet5(), context=ctx) - warmup_epochs = 1 - epoch_it = 
int(train_iter.num_data / batch_size) - # LARS works best with Polynomial scheduler and warmup - base_lr = 0.01 - optimizer_params={ - 'learning_rate': base_lr, - 'lr_scheduler': mx.lr_scheduler.PolyScheduler(base_lr=base_lr, - max_update=epoch_it * num_epochs, - warmup_steps=epoch_it * warmup_epochs), - 'momentum': 0.9, - 'eta': 14., - } - lenet_model.fit(train_iter, - eval_data=test_iter, - optimizer='lars', - optimizer_params=optimizer_params, - eval_metric='acc', - num_epoch=num_epochs) - - # predict accuracy for lenet - acc = mx.gluon.metric.Accuracy() - lenet_model.score(test_iter, acc) - accuracy = acc.get()[1] - assert accuracy > 0.98, "LeNet-5 training accuracy on MNIST was too low" - diff --git a/tests/nightly/test_tlocal_racecondition.py b/tests/nightly/test_tlocal_racecondition.py deleted file mode 100644 index 986e1f464bfb..000000000000 --- a/tests/nightly/test_tlocal_racecondition.py +++ /dev/null @@ -1,110 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -from mxnet import gluon -from mxnet import image -from mxnet import nd -import numpy as np -import logging - -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) - -root_url = ('https://apache-mxnet.s3-accelerate.amazonaws.com/' - 'gluon/dataset/pikachu/') -data_dir = './data/pikachu/' -dataset = {'train.rec': 'e6bcb6ffba1ac04ff8a9b1115e650af56ee969c8', - 'train.idx': 'dcf7318b2602c06428b9988470c731621716c393', - 'val.rec': 'd6c33f799b4d058e82f2cb5bd9a976f69d72d520'} -for k, v in dataset.items(): - gluon.utils.download(root_url+k, data_dir+k, sha1_hash=v) - -T = 1 -devs = [mx.gpu(i) for i in range(4)] -data_shape = 224 * T -batch_size = 20 * len(devs) -rgb_mean = np.array([1,2,3]) - -class_names = ['pikachu'] -num_class = len(class_names) - -def get_iterators(data_shape, batch_size): - train_iter = image.ImageDetIter( - batch_size=batch_size, - data_shape=(3, data_shape, data_shape), - path_imgrec=data_dir+'train.rec', - path_imgidx=data_dir+'train.idx', - shuffle=True, - mean=True, - rand_crop=1, - min_object_covered=0.95, - max_attempts=200) - val_iter = image.ImageDetIter( - batch_size=batch_size, - data_shape=(3, data_shape, data_shape), - path_imgrec=data_dir+'val.rec', - shuffle=False, - mean=True) - return train_iter, val_iter, class_names, num_class - -train_data, test_data, class_names, num_class = get_iterators( - data_shape, batch_size) - - -class MyCustom(mx.operator.CustomOp): - def __init__(self): - super(MyCustom, self).__init__() - def forward(self, is_train, req, in_data, out_data, aux): - self.assign(out_data[0], req[0], 0) - def backward(self, req, out_grad, in_data, out_data, in_grad, aux): - self.assign(in_grad[0], req[0], 0) - self.assign(in_grad[1], req[1], 0) - -@mx.operator.register("MyCustom") -class 
MyCustomProp(mx.operator.CustomOpProp): - def __init__(self): - super(MyCustomProp, self).__init__(need_top_grad = False) - def list_arguments(self): - return ["data", "label"] - def list_outputs(self): - return ["loss"] - def infer_shape(self, in_shape): - return [in_shape[0], in_shape[1]], [(1, )], [] - def infer_type(self, in_type): - dtype = in_type[0] - return [dtype, dtype], [dtype], [] - def create_operator(self, ctx, shapes, dtypes): - return MyCustom() - -class MyMetric(mx.gluon.metric.EvalMetric): - def __init__(self): - super(MyMetric, self).__init__("MyMetric") - self.name = ['empty'] - def update(self, labels, preds): - pass - def get(self): - return self.name, [0] - -if __name__ == '__main__': - x = mx.sym.Variable("data") - label = mx.sym.Variable("label") - x = mx.sym.FullyConnected(data = x, num_hidden = 100) - label = mx.sym.Reshape(data = label, shape = (0, -1)) - sym = mx.sym.Custom(data = x, label = label, op_type = "MyCustom") - model = mx.module.Module(context = devs, symbol = sym, data_names = ('data',), label_names = ('label',)) - model.fit(train_data = train_data, begin_epoch = 0, num_epoch = 20, allow_missing = True, batch_end_callback = mx.callback.Speedometer(batch_size, 5), eval_metric = MyMetric()) diff --git a/tests/python/gpu/test_contrib_amp.py b/tests/python/gpu/test_contrib_amp.py index d1d91629b406..f856c8fb2b1b 100644 --- a/tests/python/gpu/test_contrib_amp.py +++ b/tests/python/gpu/test_contrib_amp.py @@ -216,46 +216,6 @@ def check_amp_convert_symbol(): exe2.outputs[0].wait_to_read() - def check_amp_convert_model(): - # Test with real world model, default inputs for convert_model - dir_path = os.path.dirname(os.path.realpath(__file__)) - model_path = os.path.join(dir_path, 'model') - if not os.path.isdir(model_path): - os.mkdir(model_path) - prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) - - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - - # Test with real world model, tweak inputs for convert_model - result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, - arg_params, - aux_params, - target_dtype="float16", - target_dtype_ops=["Convolution"]) - mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.gpu()) - mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]]) - - mod.set_params(result_arg_params, result_aux_params) - mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], - label=[mx.nd.ones((1,))])) - mod.get_outputs()[0].asnumpy() - assert mod._arg_params["stage2_unit1_conv2_weight"].dtype == np.float32 - - # Call convert_model with cast_optional_params set to True - result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, - arg_params, - aux_params, - target_dtype="float16", - target_dtype_ops=["Convolution"], cast_optional_params=True) - mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.gpu()) - mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]]) - mod.set_params(result_arg_params, result_aux_params) - mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], - label=[mx.nd.ones((1,))])) - mod.get_outputs()[0].asnumpy() - assert mod._arg_params["stage2_unit1_conv2_weight"].dtype == np.float16 - - def check_amp_convert_hybrid_block(): # Test conversion for hybrid block on CPU model_cpu = get_model("resnet50_v1") @@ -315,7 +275,6 @@ def check_amp_convert_hybrid_block(): with 
mx.Context(mx.gpu(0)): check_amp_convert_symbol() - check_amp_convert_model() check_amp_convert_hybrid_block() @with_seed() @@ -334,50 +293,6 @@ def test_amp_conversion_rnn(amp_tests): mx.test_utils.assert_almost_equal(out.asnumpy(), out2.asnumpy(), atol=1e-2, rtol=1e-2) -@with_seed() -@pytest.mark.skip(reason='Error during waitall(). Tracked in #18099') -def test_module_backward_compatibility(amp_tests): - channel_num = 10 - conv_layer_filter_dims = [2, 3] - conv_layer_strides = [1, 1] - dimension = 5 - data_len = 10 - - data = mx.sym.var("data") - conv = mx.sym.Convolution(data, - num_filter=channel_num, - kernel=tuple(conv_layer_filter_dims), - stride=tuple(conv_layer_strides)) - - bn = mx.sym.BatchNorm(conv, - eps=0.001, - momentum=0.9, - fix_gamma=False, - use_global_stats=False, - output_mean_var=False, - name="conv0_batchnorm") - fc = mx.sym.FullyConnected(bn, num_hidden=10, name="fullyconnected") - mod = mx.mod.Module(fc, data_names=["data"], context=mx.gpu(0)) - mod.bind(data_shapes=[['data', (1, 3, 224, 224)]]) - mod.init_params() - - arg_params, aux_params = mod.get_params() - for param_key, param_val in arg_params.items(): - assert param_val.dtype == np.float32, "Incorrect inference type for arg_params," \ - "please check simple_bind for module executor" - for param_key, param_val in aux_params.items(): - assert param_val.dtype == np.float32, "Incorrect inference type for aux_params," \ - "please check simple_bind for module executor" - - - sym, arg_params, aux_params = amp.convert_model(mod._symbol, mod._arg_params, mod._aux_params, target_dtype_ops=["Convolution"]) - mod = mx.mod.Module(sym, data_names=["data"], context=mx.gpu(0)) - mod.bind(data_shapes=[['data', (1, 3, 224, 224)]]) - mod.set_params(arg_params, aux_params) - assert arg_params["fullyconnected_weight"].dtype == np.float16, \ - "Module API is overwriting the inferred dtype for a mixed precision model" - - @with_seed() @pytest.mark.skip(reason='Error during waitall(). 
Tracked in #18099') def test_fp16_casting(amp_tests): diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index b6d0011f1a2f..6e43559697b0 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -1716,30 +1716,6 @@ def test_take_with_type(): arg_params=arg_params) -def check_rnn_consistency(cell1, cell2): - dshape = (32, 5, 200) - data = mx.sym.Variable('data') - - sym1, _ = cell1.unroll(5, data, merge_outputs=True) - mod1 = mx.mod.Module(sym1, label_names=None, context=mx.gpu(0)) - mod1.bind(data_shapes=[('data', dshape)], label_shapes=None) - - sym2, _ = cell2.unroll(5, data, merge_outputs=True) - mod2 = mx.mod.Module(sym2, label_names=None, context=mx.gpu(0)) - mod2.bind(data_shapes=[('data', dshape)], label_shapes=None) - - mod1.init_params() - args, auxs = mod1.get_params() - args = cell1.unpack_weights(args) - args = cell2.pack_weights(args) - mod2.set_params(args, auxs) - - batch=mx.io.DataBatch(data=[mx.random.uniform(shape=dshape)], label=[]) - mod1.forward(batch, is_train=False) - mod2.forward(batch, is_train=False) - - mx.test_utils.assert_allclose(mod1.get_outputs()[0], mod2.get_outputs()[0], rtol=1e-2, atol=1e-4) - @with_seed() @pytest.mark.serial def test_psroipooling_with_type(): diff --git a/tests/python/gpu/test_predictor.py b/tests/python/gpu/test_predictor.py index 592733a90174..b1a4d2ef1df6 100644 --- a/tests/python/gpu/test_predictor.py +++ b/tests/python/gpu/test_predictor.py @@ -62,63 +62,3 @@ def test_predictor_with_dtype(): predictor_out1 = predictor.get_output(0) assert_almost_equal(out1.asnumpy(), predictor_out1, rtol=1e-5, atol=1e-6) - -def compare_module_cpredict(result_sym, result_arg_params, result_aux_params, monitor_callback=False): - # Dummmy inputs - input1 = np.ones((1, 3, 224, 224)) - input1 = input1.astype(np.float32) - nd_dict = {} - def pred_mon_callback(name, arr): - nd_dict[name] = arr - mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.gpu()) - mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]], for_training=False) - mod.set_params(result_arg_params, result_aux_params) - mod.forward(mx.io.DataBatch(data=[mx.nd.array(input1, ctx=mx.gpu())], - label=[mx.nd.ones((1,), ctx=mx.gpu())])) - prefix = "test_predictor_amp" - mod.save_checkpoint(prefix, 0, remove_amp_cast=False) - sym_file = "{}-symbol.json".format(prefix) - params_file = "{}-0000.params".format(prefix) - predictor = Predictor(open(sym_file, "r").read(), - open(params_file, "rb").read(), - {'data': (1, 3, 224, 224), - 'softmax_label': (1,)}, - dev_type="gpu", - dev_id=0) - if monitor_callback: - predictor.set_monitor_callback(pred_mon_callback, monitor_all=True) - predictor.forward(data=input1, softmax_label=mx.nd.ones((1,)).asnumpy()) - predictor_out1 = predictor.get_output(0) - if monitor_callback: - assert len(nd_dict) > 0, "Callback not called" - assert_almost_equal(mod.get_outputs()[0].asnumpy(), predictor_out1, atol=1e-1, rtol=1e-1) - - -@with_seed() -def test_predictor_amp(): - dir_path = os.path.dirname(os.path.realpath(__file__)) - model_path = os.path.join(dir_path, 'model') - if not os.path.isdir(model_path): - os.mkdir(model_path) - prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) - - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - - - # Convert model to mixed precision model, params in FP32 - result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, - 
arg_params, - aux_params, - target_dtype="float16", - target_dtype_ops=["Convolution"]) - compare_module_cpredict(result_sym, result_arg_params, result_aux_params) - - # Convert model to mixed precision model, params in FP16 - result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, - arg_params, - aux_params, - target_dtype="float16", - target_dtype_ops=["Convolution"], - cast_optional_params=True) - compare_module_cpredict(result_sym, result_arg_params, result_aux_params, monitor_callback=True) - diff --git a/tests/python/mkl/test_contrib_amp.py b/tests/python/mkl/test_contrib_amp.py index ec88851751ff..e63424c46a80 100644 --- a/tests/python/mkl/test_contrib_amp.py +++ b/tests/python/mkl/test_contrib_amp.py @@ -211,46 +211,6 @@ def check_amp_convert_symbol(): exe2.forward(is_train=False, **inputs) exe2.outputs[0].wait_to_read() - def check_amp_convert_model(): - # Test with real world model, default inputs for convert_model - dir_path = os.path.dirname(os.path.realpath(__file__)) - model_path = os.path.join(dir_path, 'model') - if not os.path.isdir(model_path): - os.mkdir(model_path) - prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path) - - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - - # Test with real world model, tweak inputs for convert_model - result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, - arg_params, - aux_params, - target_dtype="bfloat16", - target_dtype_ops=["Convolution"]) - mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.cpu()) - mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]]) - - mod.set_params(result_arg_params, result_aux_params) - mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], - label=[mx.nd.ones((1,))])) - mod.get_outputs()[0].asnumpy() - assert mod._arg_params["stage2_unit1_conv2_weight"].dtype == np.float32 - - # Call convert_model with cast_optional_params set to True - result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, - arg_params, - aux_params, - target_dtype="bfloat16", - target_dtype_ops=["Convolution"], cast_optional_params=True) - mod = mx.mod.Module(result_sym, data_names=["data"], label_names=["softmax_label"], context=mx.cpu()) - mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]]) - mod.set_params(result_arg_params, result_aux_params) - mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))], - label=[mx.nd.ones((1,))])) - mod.get_outputs()[0].asnumpy() - assert mod._arg_params["stage2_unit1_conv2_weight"].dtype == bfloat16 - - def check_amp_convert_hybrid_block(): # Test conversion for hybrid block on CPU model_cpu = get_model("resnet50_v1") @@ -308,120 +268,8 @@ def check_amp_convert_hybrid_block(): assert params["stage2_unit1_conv2_weight"].dtype == bfloat16 check_amp_convert_symbol() - check_amp_convert_model() check_amp_convert_hybrid_block() - -def test_amp_accuracy(): - def check_amp_convert_conv_accuracy(data_shape, kernel, num_filter, pad, stride, no_bias, cast_optional_params): - Batch = collections.namedtuple('Batch',['data']) - data = mx.sym.Variable(name='data') - data_low = 0.0 - data_high = 100.0 - conv2d = mx.sym.Convolution(data=data, kernel=kernel, num_filter=num_filter, pad=pad, stride=stride, - no_bias=no_bias, cudnn_off=False, name='conv2d') - conv_exe_fp32 = mx.mod.Module(symbol=conv2d, label_names=None, context=mx.cpu()) - conv_exe_fp32.bind(data_shapes=[('data', 
data_shape)]) - conv_exe_fp32.init_params() - data_fp32 = [mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('float32')] - conv_exe_fp32.forward(Batch(data_fp32), is_train=False) - arg_params, aux_params = conv_exe_fp32.get_params() - output_fp32 = conv_exe_fp32.get_outputs()[0] - - conv2d_bf16, arg_params_bf16, aux_params_bf16 = amp.convert_model(conv2d, arg_params, aux_params, - target_dtype="bfloat16", - target_dtype_ops=["Convolution"], - cast_optional_params=cast_optional_params) - - conv_exe_bf16 = mx.mod.Module(symbol=conv2d_bf16, label_names=None, context=mx.cpu()) - conv_exe_bf16.bind(data_shapes=[('data', data_shape)]) - conv_exe_bf16.set_params(arg_params=arg_params_bf16, aux_params=aux_params_bf16) - conv_exe_bf16.forward(Batch(data_fp32), is_train=False) - output_bf16 = conv_exe_bf16.get_outputs()[0] - output_bf16_2_fp32 = mx.nd.amp_cast(output_bf16, dtype="float32") - - assert_almost_equal(output_bf16_2_fp32, output_fp32, rtol=1e-1, atol = 2e-1) - - def check_amp_convert_fc_accuracy(data_shape, num_hidden, cast_optional_params): - Batch = collections.namedtuple('Batch',['data']) - data = mx.sym.Variable(name='data') - data_low = 0.0 - data_high = 100.0 - fc = mx.sym.FullyConnected(data=data, num_hidden=num_hidden, name='fc') - fc_exe_fp32 = mx.mod.Module(symbol=fc, label_names=None, context=mx.cpu()) - fc_exe_fp32.bind(data_shapes=[('data', data_shape)]) - fc_exe_fp32.init_params() - data_fp32 = [mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape).astype('float32')] - fc_exe_fp32.forward(Batch(data_fp32), is_train=False) - arg_params, aux_params = fc_exe_fp32.get_params() - output_fp32 = fc_exe_fp32.get_outputs()[0] - - fc_bf16, arg_params_bf16, aux_params_bf16 = amp.convert_model(fc, arg_params, aux_params, - target_dtype="bfloat16", - target_dtype_ops=["FullyConnected"], cast_optional_params=cast_optional_params) - - fc_exe_bf16 = mx.mod.Module(symbol=fc_bf16, label_names=None, context=mx.cpu()) - fc_exe_bf16.bind(data_shapes=[('data', data_shape)]) - fc_exe_bf16.set_params(arg_params_bf16, aux_params_bf16) - fc_exe_bf16.forward(Batch(data_fp32), is_train=False) - - output_bf16 = fc_exe_bf16.get_outputs()[0] - output_bf16_2_fp32 = mx.nd.amp_cast(output_bf16, dtype="float32") - - assert_almost_equal(output_bf16_2_fp32, output_fp32, rtol=1e-1, atol=2e-1) - - check_amp_convert_conv_accuracy(data_shape=(3, 4, 28, 28), kernel=(3, 3), num_filter=128, pad=(1, 1), stride=(1, 1), no_bias=True, cast_optional_params=False) - check_amp_convert_conv_accuracy(data_shape=(512, 10, 28, 28), kernel=(1, 1), num_filter=16, pad=(0, 0), stride=(1, 1), no_bias=True, cast_optional_params=True) - check_amp_convert_conv_accuracy(data_shape=(128, 56, 14, 14), kernel=(3, 3), num_filter=28, pad=(1, 1), stride=(1, 1), no_bias=False, cast_optional_params=False) - - check_amp_convert_fc_accuracy(data_shape=(1024, 32), num_hidden=1000, cast_optional_params=False) - check_amp_convert_fc_accuracy(data_shape=(40, 32), num_hidden=10, cast_optional_params=True) - - -@with_seed() -def test_module_backward_compatibility(): - channel_num = 10 - conv_layer_filter_dims = [2, 3] - conv_layer_strides = [1, 1] - dimension = 5 - data_len = 10 - - data = mx.sym.var("data") - conv = mx.sym.Convolution(data, - num_filter=channel_num, - kernel=tuple(conv_layer_filter_dims), - stride=tuple(conv_layer_strides)) - - bn = mx.sym.BatchNorm(conv, - eps=0.001, - momentum=0.9, - fix_gamma=False, - use_global_stats=False, - output_mean_var=False, - name="conv0_batchnorm") - fc = 
mx.sym.FullyConnected(bn, num_hidden=10, name="fullyconnected") - mod = mx.mod.Module(fc, data_names=["data"], context=mx.cpu()) - mod.bind(data_shapes=[['data', (1, 3, 224, 224)]]) - mod.init_params() - - arg_params, aux_params = mod.get_params() - for param_key, param_val in arg_params.items(): - assert param_val.dtype == np.float32, "Incorrect inference type for arg_params," \ - "please check simple_bind for module executor" - for param_key, param_val in aux_params.items(): - assert param_val.dtype == np.float32, "Incorrect inference type for aux_params," \ - "please check simple_bind for module executor" - - - sym, arg_params, aux_params = amp.convert_model(mod._symbol, mod._arg_params, mod._aux_params, - target_dtype="bfloat16", target_dtype_ops=["Convolution"]) - mod = mx.mod.Module(sym, data_names=["data"], context=mx.cpu()) - mod.bind(data_shapes=[['data', (1, 3, 224, 224)]]) - mod.set_params(arg_params, aux_params) - assert arg_params["fullyconnected_weight"].dtype == bfloat16, \ - "Module API is overwriting the inferred dtype for a mixed precision model" - - @with_seed() def test_bf16_casting(): data = mx.sym.var("data") diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py index 2ca788f2dce7..6b075dec463c 100644 --- a/tests/python/mkl/test_mkldnn.py +++ b/tests/python/mkl/test_mkldnn.py @@ -24,7 +24,6 @@ import mxnet as mx import pytest from mxnet.test_utils import rand_ndarray, assert_almost_equal -from mxnet.module import Module from mxnet import gluon from mxnet.gluon import nn from mxnet.test_utils import * @@ -617,23 +616,6 @@ def hybrid_forward(self, F, x): output = net(data) a = output.asnumpy() -@with_seed() -def test_weight_async_reorder(): - data = mx.sym.Variable("data") - w1 = mx.sym.Variable("1_weight") - w2 = mx.sym.Variable("2_weight") - conv1 = mx.sym.Convolution(data=data, weight=w1 + w1, num_filter=32, no_bias=True, kernel=(3, 3)) - conv2 = mx.sym.Convolution(data=conv1, weight=w2 + w2, num_filter=32, no_bias=True, kernel=(1, 1)) - mod = Module(symbol=conv2, label_names=None, context=mx.current_context()) - mod.bind(for_training=False, data_shapes=[('data', (10, 16, 50, 50))]) - mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - data = [mx.random.uniform(-1.0, 1.0, shape=(10, 16, 50, 50), ctx=mx.current_context())] - batch=mx.io.DataBatch(data, []) - for i in range(2): - mod.forward(batch, is_train=False) - for output in mod.get_outputs(): - output.wait_to_read() - @with_seed() def test_concat(): def ref_concat(a, b, axis): diff --git a/tests/python/mkl/test_subgraph.py b/tests/python/mkl/test_subgraph.py index d902e539a132..9e5d4776c30e 100644 --- a/tests/python/mkl/test_subgraph.py +++ b/tests/python/mkl/test_subgraph.py @@ -21,7 +21,6 @@ import numpy as np import unittest import ctypes -from mxnet.module import Module from mxnet.symbol import Symbol from importlib import import_module from numpy.testing import assert_allclose @@ -82,27 +81,6 @@ def check_qsym_scale_align(qsym): assert max_calib_range == v['max_calib_range'] - -def check_qsym_forward(qsym, qarg_params, qaux_params, batch, data_shape): - mod = Module(symbol=qsym, label_names=None, context=mx.current_context()) - mod.bind(for_training=False, - data_shapes=[('data', data_shape)]) - mod.set_params(qarg_params, qaux_params) - mod.forward(batch, is_train=False) - for output in mod.get_outputs(): - output.wait_to_read() - return mod.get_outputs() - -def check_qsym_dummy_forward(qsym, batch, data_shape): - mod = Module(symbol=qsym, label_names=None, 
context=mx.current_context()) - mod.bind(for_training=False, - data_shapes=[('data', data_shape)]) - mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - mod.forward(batch, is_train=False) - for output in mod.get_outputs(): - output.wait_to_read() - return mod.get_outputs() - def check_qsym_gluon_forward(path, qsym, qarg_params, qaux_params, data_shape): # save qsym to JSON file _, json_path = tempfile.mkstemp(suffix='-symbol.json', dir=path) @@ -133,164 +111,6 @@ def __iter__(self): yield self.batch -def check_quantize(sym, data_shape, out_type, path, name='conv', - check_calibration=True, gluon_forward=False, check_scale_align=False): - quantize_granularity_list = ['tensor-wise'] - if name == 'fc': - quantize_granularity_list += ['channel-wise'] - - if name in config: - name = config[name][OP_NAME] - sym_sg = sym.get_backend_symbol(QUANTIZE_SG_PASS_NAME) - mod = Module(symbol=sym, label_names=None) - mod.bind(for_training=False, - data_shapes=[('data', data_shape)]) - mod.init_params(mx.init.Normal(0.5)) - arg_params, aux_params = mod.get_params() - - if out_type == 'uint8': - data = [mx.random.uniform(0.0, 1.0, shape=shape, ctx=mx.current_context()) for _, shape in mod.data_shapes] - else: - data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=mx.current_context()) for _, shape in mod.data_shapes] - batch = mx.io.DataBatch(data, []) - - mod.forward(batch, is_train=False) - for output in mod.get_outputs(): - output.wait_to_read() - ref_out = mod.get_outputs() - - excluded_sym_names = [] - excluded_op_names = [] - if mx.current_context() == mx.cpu() and gluon_forward == True: - excluded_op_names += ['_sg_mkldnn_fully_connected'] - - calib_data = CalibIter(batch, data_shape, 1) - - for quantize_granularity in quantize_granularity_list: - qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym_sg, - arg_params=arg_params, - aux_params=aux_params, - ctx=mx.current_context(), - excluded_sym_names=excluded_sym_names, - excluded_op_names=excluded_op_names, - quantized_dtype=out_type, - calib_mode='naive', - calib_data=calib_data, - label_names=None, - num_calib_examples=1, - quantize_mode='full', - quantize_granularity=quantize_granularity) - qsym = qsym.get_backend_symbol(QUANTIZE_SG_PASS_NAME) - if check_calibration: - check_qsym_calibrated(qsym, out_type, name=name) - if check_scale_align: - check_qsym_scale_align(qsym) - if gluon_forward == True: - check_qsym_gluon_forward(path, qsym, qarg_params, qaux_params, data_shape) - else: - quantized_out = check_qsym_forward(qsym, qarg_params, qaux_params, batch, data_shape) - for i in range(len(ref_out)): - min_range = mx.nd.min(ref_out[i]).asscalar() - max_range = mx.nd.max(ref_out[i]).asscalar() - atol = 0.1 * max(abs(min_range), abs(max_range)) - assert_almost_equal_with_err(quantized_out[i].asnumpy(), ref_out[i].asnumpy(), rtol=0.1, atol=atol, etol=0.2) - check_qsym_dummy_forward(qsym, batch, data_shape) - -@with_seed() -def check_quantize_whole_model_with_forward(): - def check_qsym_forward(qsym, qarg_params, qaux_params, data_shape): - mod = Module(symbol=qsym, label_names=None, context=mx.current_context()) - mod.bind(for_training=False, - data_shapes=[('data', data_shape)]) - mod.set_params(qarg_params, qaux_params) - data = [mx.random.uniform(-1.0, 1.0, shape=shape) for _, shape in mod.data_shapes] - batch = mx.io.DataBatch(data, []) - mod.forward(batch, is_train=False) - for output in mod.get_outputs(): - output.wait_to_read() - - def check_quantize_whole_model(out_type): - batch_size = 4 - data_shape = 
(batch_size, 4, 10, 10) - data = mx.sym.Variable('data') - conv0 = mx.sym.Convolution(data, kernel=(1, 1), num_filter=16, name='conv0') - sym = mx.sym.Convolution(conv0, kernel=(1, 1), num_filter=16, name='conv1') - sym_sg = sym.get_backend_symbol('MKLDNN_QUANTIZE') - mod = Module(symbol=sym, label_names=None) - mod.bind(for_training=False, - data_shapes=[('data', data_shape)]) - - mod.init_params(mx.init.Normal(0.5)) - arg_params, aux_params = mod.get_params() - - excluded_sym_names = [] - - calib_data = mx.nd.random.uniform(shape=data_shape) - calib_data = mx.io.NDArrayIter(data=calib_data) - calib_data = DummyIter(calib_data) - qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym_sg, - arg_params=arg_params, - aux_params=aux_params, - ctx=mx.current_context(), - excluded_sym_names=excluded_sym_names, - quantized_dtype=out_type, - calib_mode='naive', - calib_data=calib_data, - label_names=None, - num_calib_examples=1, - quantize_mode='full') - qsym = qsym.get_backend_symbol('MKLDNN_QUANTIZE') - check_qsym_forward(qsym, qarg_params, qaux_params, data_shape) - - for qdtype in ['uint8', 'int8', 'auto']: - check_quantize_whole_model(qdtype) - -@with_seed() -def check_fusion(sym, data_shape, attrs_dict, path, check_fp32_fusion=True, check_quantization=True, out_types=['uint8', 'int8', 'auto']): - if check_fp32_fusion: - data_min = -1.0 - data_max = 1.0 - if ''.join(sym.get_internals().list_outputs()).find('sqrt') != -1: - check_quantization = False - data_min = 0 - - sym_sg = sym.get_backend_symbol(SG_PASS_NAME) - for name, attrs in attrs_dict.items(): - if name in config: - op_name = config[name][OP_NAME] - else: - op_name = name - assert ''.join(sym_sg.get_internals().list_outputs()).find(op_name) != -1 - if len(attrs): - found = False - for k, v in sym_sg.attr_dict().items(): - if k.find(op_name) != -1: - found = True - for attr_name, attr_value in attrs.items(): - assert v[attr_name].lower() == attr_value.lower() - assert found - arg_shapes, _, aux_shapes = sym.infer_shape() - arg_array = [mx.nd.random.uniform(data_min, data_max, shape=shape) for shape in arg_shapes] - aux_array = [mx.nd.random.uniform(shape=shape) for shape in aux_shapes] - exe = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null') - exe.forward() - os.environ['MXNET_SUBGRAPH_BACKEND'] = SG_PASS_NAME - exe_sg = sym.bind(ctx=mx.current_context(), args=arg_array, aux_states=aux_array, grad_req='null') - exe_sg.forward() - del os.environ['MXNET_SUBGRAPH_BACKEND'] - for i in range(len(exe.outputs)): - assert_almost_equal(exe.outputs[i].asnumpy(), exe_sg.outputs[i].asnumpy(), rtol=1e-3, atol=1e-1) - - if check_quantization: - # fp32 to int8 - for out_type in out_types: - check_quantize(sym, data_shape, out_type, path, name=name) - # TODO(ciyong), since quantized fc save its params in int8, while gluon treat the default - # variable from symbol file as fp32 which results in mismatch dtype of params. - # Skip quantized fc in gluon pass. 
- if name != 'fc': - check_quantize(sym, data_shape, out_type, path, name=name, gluon_forward=True) - def check_neg_fusion(syms, attrs_name=None, excluded_attrs=None, date_shape=(4,4,10,10), name='conv'): op_name = config[name][OP_NAME] @@ -312,16 +132,6 @@ def head_symbol(data_shape): weight = mx.symbol.Variable('weight', dtype='float32') return data, weight -@with_seed() -@pytest.mark.parametrize('data_shape', DATA_SHAPE) -@pytest.mark.parametrize('no_bias', [True, False]) -def test_pos_single_conv(no_bias, data_shape, tmpdir): -# single conv fusion case - attr = {'conv': []} - data, weight = head_symbol(data_shape) - conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=no_bias) - check_fusion(conv, data_shape, attr, str(tmpdir)) # conv + bn fusion case def conv_bn(no_bias, data_shape): @@ -367,147 +177,7 @@ def conv_act_sum(no_bias, data_shape, alg): sum = relu + conv1 return sum, attr -# conv + add fusion case -@with_seed() -@pytest.mark.parametrize('data_shape', DATA_SHAPE) -@pytest.mark.parametrize('no_bias', [True, False]) -def test_pos_conv_add(no_bias, data_shape, tmpdir): - attr = {'conv': {'with_sum': 'true'}} - data, weight = head_symbol(data_shape) - conv1 = mx.symbol.Convolution(data=data, weight=weight, name='conv1', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=no_bias) - conv2 = mx.symbol.Convolution(data=data, name='conv2', num_filter=64, - kernel=(3, 3), stride=(1, 1)) - pool = mx.sym.Pooling(data=conv2, kernel=(1, 1), pool_type='avg', name='pool') - sum = conv1 + pool - check_fusion(sum, data_shape, attr, str(tmpdir)) - -# conv + add fusion case 2 -@with_seed() -@pytest.mark.parametrize('data_shape', DATA_SHAPE) -@pytest.mark.parametrize('no_bias', [True, False]) -def test_pos_conv_add2(no_bias, data_shape, tmpdir): - attr = {'conv': {'with_sum': 'true'}} - data, weight = head_symbol(data_shape) - conv1 = mx.symbol.Convolution(data=data, weight=weight, name='conv1', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=no_bias) - conv2 = mx.symbol.Convolution(data=data, name='conv2', num_filter=64, - kernel=(3, 3), stride=(1, 1)) - pool = mx.sym.Pooling(data=conv2, kernel=(1, 1), pool_type='avg', name='pool') - sum = pool + conv1 - check_fusion(sum, data_shape, attr, str(tmpdir)) - -# conv + bn + act fusion case -@with_seed() -@pytest.mark.parametrize('data_shape', DATA_SHAPE) -@pytest.mark.parametrize('alg,quantize', [ - ("relu", True), - ("sigmoid", True), - ("tanh", True), - ("softrelu", True), - ("relu6", True), - ("leakyrelu", True), - ("gelu", True) -]) -@pytest.mark.parametrize('no_bias', [True, False]) -def test_pos_conv_bn_act(no_bias, data_shape, alg, quantize, tmpdir): - attr = {'conv': {'with_bn': 'true', 'with_act': 'true'}} - data, weight = head_symbol(data_shape) - conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=no_bias) - bn1 = mx.symbol.BatchNorm(data=conv, name="bn1") - if alg == "relu6": - relu = mx.symbol.clip(data=bn1, name='relu6', a_min=0, a_max=6) - elif alg == "leakyrelu": - relu = mx.symbol.LeakyReLU(data=bn1, slope=0.25, act_type='leaky') - elif alg == "gelu": - relu = mx.symbol.LeakyReLU(data=bn1, act_type='gelu') - else: - relu = mx.symbol.Activation(data=bn1, name=alg, act_type=alg) - check_fusion(relu, data_shape, attr, str(tmpdir), check_quantization=quantize) - -# conv + bn + add + act fusion case -@with_seed() -@pytest.mark.parametrize('data_shape', DATA_SHAPE) 
-@pytest.mark.parametrize('alg,quantize', [ - ("relu", True), - ("sigmoid", True), - ("tanh", True), - ("softrelu", True), - ("relu6", False), - ("leakyrelu", True), - ("gelu", False) -]) -@pytest.mark.parametrize('no_bias', [True, False]) -def test_pos_conv_bn_sum_act(no_bias, data_shape, alg, quantize, tmpdir): - attr = {'conv': {'with_sum': 'true', 'with_postsum_act': 'true', 'with_bn': 'true'}} - data, weight = head_symbol(data_shape) - conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=no_bias) - bn1 = mx.symbol.BatchNorm(data=conv, name="bn1") - conv1 = mx.symbol.Convolution(data=data, weight=weight, name='conv1', num_filter=64, - kernel=(3, 3), stride=(1, 1)) - sum1 = bn1 + conv1 - if alg == "relu6": - relu = mx.symbol.clip(data=sum1, name='relu6', a_min=0, a_max=6) - elif alg == "leakyrelu": - relu = mx.symbol.LeakyReLU(data=sum1, slope=0.25, act_type='leaky') - elif alg == "gelu": - relu = mx.symbol.LeakyReLU(data=sum1, act_type='gelu') - else: - relu = mx.symbol.Activation(data=sum1, name=alg, act_type=alg) - check_fusion(relu, data_shape, attr, str(tmpdir), check_quantization=quantize) - -# single concat case -@with_seed() -@pytest.mark.parametrize('data_shape', DATA_SHAPE) -@pytest.mark.parametrize('input_num,dim', [ - (2, -1), - (2, 1), - (4, 2), - (4, 3) -]) -@pytest.mark.parametrize('out_type', ['int8', 'auto']) -@pytest.mark.parametrize('gluon_forward', [False, True]) -def test_pos_single_concat(data_shape, input_num, dim, gluon_forward, out_type, tmpdir): - data = mx.symbol.Variable('data', shape=data_shape, dtype='float32') - inputs = [] - for i in range(input_num): - inputs.append(data) - concat = mx.symbol.Concat(*inputs, name="concat", dim=dim) - check_quantize(concat, data_shape, out_type, str(tmpdir), name='conv', - check_calibration=False, gluon_forward=gluon_forward) - -@pytest.mark.parametrize('data_shape', DATA_SHAPE) -@pytest.mark.parametrize('out_type', ['int8', 'auto']) -def test_pos_single_concat_pos_neg(data_shape, out_type, tmpdir): - data, weight = head_symbol(data_shape) - conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=4, - kernel=(1, 1), stride=(1, 1), no_bias=True) - relu = mx.symbol.Activation(data=conv, name='relu', act_type='relu') - inputs = [data, relu] - concat = mx.symbol.Concat(*inputs, name="concat", dim=1) - check_quantize(concat, data_shape, out_type, str(tmpdir), name='', check_calibration=False) - -# concat scale alignment case -@with_seed() -@pytest.mark.parametrize('data_shape', DATA_SHAPE) -@pytest.mark.parametrize('out_type', ['int8', 'auto']) -@pytest.mark.parametrize('gluon_forward', [False, True]) -def test_pos_concat_scale_align(data_shape, out_type, gluon_forward, tmpdir): - data, weight = head_symbol(data_shape) - conv1 = mx.symbol.Convolution(data=data, weight=weight, name='conv1', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=True) - conv2 = mx.symbol.Convolution(data=data, weight=weight * 2, name='conv2', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=True) - conv3 = mx.symbol.Convolution(data=data, weight=weight * 3, name='conv3', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=True) - conv4 = mx.symbol.Convolution(data=data, weight=weight * 4, name='conv4', num_filter=64, - kernel=(3, 3), stride=(1, 1), no_bias=True) - concat = mx.symbol.Concat(*[conv1, conv2, conv3, conv4], name="concat", dim=1) - check_quantize(concat, data_shape, out_type, str(tmpdir), check_calibration=True, - 
check_scale_align=True, gluon_forward=gluon_forward) # mobilenetv2 case @@ -757,36 +427,6 @@ def neg_fc_relu(no_bias, data_shape, flatten=True): excluded_attrs.append([]) return syms, attrs, excluded_attrs -@with_seed() -def test_pos_conv_act(tmpdir): - act_list = {"relu": True, - "sigmoid": True, - "tanh": True, - "softrelu": True, - "relu6": True, - "leakyrelu": True, - "gelu": True} - for data_shape in DATA_SHAPE: - for (alg, quantize) in act_list.items(): - net, attrs = conv_act(False, data_shape, alg) - check_fusion(net, data_shape, attrs, str(tmpdir), check_quantization=quantize) - net, attrs = conv_act(True, data_shape, alg) - check_fusion(net, data_shape, attrs, str(tmpdir), check_quantization=quantize) - -@with_seed() -def test_pos_conv_bn(tmpdir): - for data_shape in DATA_SHAPE: - net, attrs = conv_bn(False, data_shape) - check_fusion(net, data_shape, attrs, str(tmpdir)) - net, attrs = conv_bn(True, data_shape) - check_fusion(net, data_shape, attrs, str(tmpdir)) - -@with_seed() -@pytest.mark.parametrize('data_shape', DATA_SHAPE) -def test_mobilenetv2_struct(data_shape, tmpdir): - net, attrs = mobilenetv2_struct(data_shape) - check_fusion(net, data_shape, attrs, str(tmpdir), out_types=['int8', 'auto']) - @with_seed() @pytest.mark.parametrize('data_shape', DATA_SHAPE) def test_neg_conv_bn(data_shape): @@ -817,22 +457,6 @@ def test_neg_conv_bn_add_relu(data_shape): syms, attrs, excluded_attrs = neg_conv_bn_add_relu(data_shape) check_neg_fusion(syms, attrs, excluded_attrs, data_shape) -@with_seed() -@pytest.mark.parametrize('data_shape', DATA_SHAPE) -@pytest.mark.parametrize('no_bias', [True, False]) -@pytest.mark.parametrize('flatten', [True, False]) -def test_single_fc(data_shape, no_bias, flatten, tmpdir): - syms, attrs = single_fc(no_bias, data_shape, flatten) - check_fusion(syms, data_shape, attrs, str(tmpdir), check_quantization=flatten) - -@with_seed() -@pytest.mark.parametrize('data_shape', DATA_SHAPE) -@pytest.mark.parametrize('no_bias', [True, False]) -@pytest.mark.parametrize('flatten', [True, False]) -@pytest.mark.parametrize('alg', fc_post_ops_list) -def test_fc_eltwise(data_shape, no_bias, flatten, alg, tmpdir): - syms, attrs = fc_eltwise(no_bias, data_shape, flatten, alg) - check_fusion(syms, data_shape, attrs, str(tmpdir), check_quantization=flatten) @with_seed() @pytest.mark.parametrize('data_shape', DATA_SHAPE) @@ -862,112 +486,3 @@ def test_float64_fallback(): ) ex.forward() ex.outputs[0].wait_to_read() - - -@with_seed() -@pytest.mark.parametrize('data_min,data_max,weight_min,weight_max', [ - (-1, 1, 0, 0), - (-1, 1, -1e-6, +1e-6), - (0, 0, 1, 1), - (-1e-6, +1e-6, -1, 1), - (-1e-6, +1e-6, -1e-6, +1e-6), - (0, 0, 0, 0) -]) -def test_quantized_conv_bias_overflow(data_min, data_max, weight_min, weight_max): - data_shape = (1, 32, 2, 2) - data = mx.symbol.Variable('data', shape=data_shape, dtype='float32') - weight = mx.symbol.Variable('weight', dtype='float32') - bias = mx.symbol.Variable('bias', dtype='float32') - sym = mx.symbol.Convolution(data=data, weight=weight, bias=bias, name='conv', num_filter=64, - kernel=(1, 1), stride=(1, 1)) - data_nd = mx.random.uniform(data_min, data_max, shape=data_shape, ctx=mx.cpu()) - weight_nd = mx.random.uniform(weight_min, weight_max, shape=[64, 32, 1, 1], ctx=mx.cpu()) - bias_nd = mx.random.uniform(-1, +1, shape=[64], ctx=mx.cpu()) - arg_params = { - 'data': data_nd, - 'weight': weight_nd, - 'bias': bias_nd - } - - ex = sym.bind(mx.cpu(), arg_params, args_grad=None) - ex.forward() - ex.outputs[0].wait_to_read() - sym_sg = 
sym.get_backend_symbol(QUANTIZE_SG_PASS_NAME) - batch = mx.io.DataBatch([data_nd], []) - calib_data = CalibIter(batch, data_shape, 1) - qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym_sg, - arg_params={ - 'weight': weight_nd, - 'bias': bias_nd - }, - aux_params={}, - ctx=mx.cpu(), - excluded_sym_names=None, - excluded_op_names=None, - quantized_dtype='int8', - calib_mode='naive', - calib_data=calib_data, - label_names=None, - num_calib_examples=1, - quantize_mode='full') - qsym = qsym.get_backend_symbol(QUANTIZE_SG_PASS_NAME) - qarg_params['data'] = data_nd - qex = qsym.bind(mx.cpu(), qarg_params, args_grad=None) - qex.forward() - qex.outputs[0].wait_to_read() - assert_almost_equal_with_err(ex.outputs[0].asnumpy(), qex.outputs[0].asnumpy(), - rtol=1e-2, atol=1e-2, etol=0.01) - -@with_seed() -@pytest.mark.parametrize('data_min,data_max,weight_min,weight_max', [ - (-1, 1, 0, 0), - (-1, 1, -1e-6, +1e-6), - (0, 0, 1, 1), - (-1e-6, +1e-6, -1, 1), - (-1e-6, +1e-6, -1e-6, +1e-6), - (0, 0, 0, 0) -]) -def test_quantized_fc_bias_overflow(data_min, data_max, weight_min, weight_max): - data_shape = (1, 32) - data = mx.symbol.Variable('data', shape=data_shape, dtype='float32') - weight = mx.symbol.Variable('weight', dtype='float32') - bias = mx.symbol.Variable('bias', dtype='float32') - sym = mx.symbol.FullyConnected(data=data, weight=weight, bias=bias, name='fc', num_hidden=64) - data_nd = mx.random.uniform(data_min, data_max, shape=data_shape, ctx=mx.cpu()) - weight_nd = mx.random.uniform(weight_min, weight_max, shape=[64, 32], ctx=mx.cpu()) - bias_nd = mx.random.uniform(-1, +1, shape=[64], ctx=mx.cpu()) - arg_params = { - 'data': data_nd, - 'weight': weight_nd, - 'bias': bias_nd - } - - ex = sym.bind(mx.cpu(), arg_params, args_grad=None) - ex.forward() - ex.outputs[0].wait_to_read() - sym_sg = sym.get_backend_symbol(QUANTIZE_SG_PASS_NAME) - batch = mx.io.DataBatch([data_nd], []) - calib_data = CalibIter(batch, data_shape, 1) - qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=sym_sg, - arg_params={ - 'weight': weight_nd, - 'bias': bias_nd - }, - aux_params={}, - ctx=mx.cpu(), - excluded_sym_names=None, - excluded_op_names=None, - quantized_dtype='int8', - calib_mode='naive', - calib_data=calib_data, - label_names=None, - num_calib_examples=1, - quantize_mode='full') - qarg_params['data'] = data_nd - qsym = qsym.get_backend_symbol(QUANTIZE_SG_PASS_NAME) - qex = qsym.bind(mx.cpu(), qarg_params, args_grad=None) - qex.forward() - qex.outputs[0].wait_to_read() - assert_almost_equal_with_err(ex.outputs[0].asnumpy(), qex.outputs[0].asnumpy(), - rtol=1e-2, atol=1e-2, etol=0.01) - diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py index 5034d90c2263..317e8cc65d86 100644 --- a/tests/python/quantization/test_quantization.py +++ b/tests/python/quantization/test_quantization.py @@ -24,7 +24,6 @@ from mxnet.gluon.model_zoo import vision from mxnet.test_utils import assert_almost_equal, assert_exception, rand_ndarray, rand_shape_nd, same, DummyIter from common import with_seed, xfail_when_nonstandard_decimal_separator -from mxnet.module import Module from mxnet.io import NDArrayIter import unittest import operator @@ -718,88 +717,6 @@ def check_quantized_act(data_shape, qdtype): check_quantized_act((10, 15, 18), qdtype) check_quantized_act((3, 4, 23, 23), qdtype) -@with_seed() -def test_quantized_bn(): - def get_mean_var(data): - mean = mx.ndarray.mean(data, axis=1, exclude=1) - mean_broad = 
mx.ndarray.expand_dims(mean, axis=0) - mean_broad = mx.ndarray.expand_dims(mean_broad, axis=2) - mean_broad = mx.ndarray.expand_dims(mean_broad, axis=3) - mean_broad = mx.ndarray.broadcast_like(mean_broad, data) - var = mx.ndarray.multiply(data - mean_broad, data - mean_broad) - var = mx.ndarray.mean(var, axis=1, exclude=1) - return mean, var - - def check_quantized_bn(data_shape, qdtype): - if is_test_for_native_cpu(): - print('skipped testing quantize_bn for native cpu since it is not supported yet') - return - elif is_test_for_gpu(): - print('skipped testing quantize_bn for gpu since it is not supported yet') - return - - # qdtype = uint8 - if qdtype == 'uint8': - data_low = 0.0 - data_high = 255.0 - else: - data_low = -127.0 - data_high = 127.0 - - # run fp32 bn - data_sym = mx.sym.Variable(name='data', shape=data_shape, dtype='float32') - bn_fp32 = mx.sym.BatchNorm(data=data_sym, name='bn', use_global_stats=True, fix_gamma=False) - arg_shapes, out_shapes, aux_shapes = bn_fp32.infer_shape(data=data_shape) - arg_names = bn_fp32.list_arguments() - aux_names = bn_fp32.list_auxiliary_states() - - data = mx.nd.random.uniform(low=data_low, high=data_high, shape=data_shape) - gamma = mx.nd.random.uniform(low=data_low, high=data_high, shape=arg_shapes[1]) - beta = mx.nd.random.uniform(low=data_low, high=data_high, shape=arg_shapes[2]) - moving_mean, moving_var = get_mean_var(data) - - bn_fp32_exe = bn_fp32.simple_bind(ctx=mx.current_context(), grad_req='null') - bn_fp32_exe.arg_dict[arg_names[0]][:] = data - bn_fp32_exe.arg_dict[arg_names[1]][:] = gamma - bn_fp32_exe.arg_dict[arg_names[2]][:] = beta - bn_fp32_exe.aux_dict[aux_names[0]][:] = moving_mean - bn_fp32_exe.aux_dict[aux_names[1]][:] = moving_var - - output= bn_fp32_exe.forward()[0] - - # generate int8 bn from fp32 bn - arg_params = dict() - for k,v in bn_fp32_exe.arg_dict.items(): - if 'data' in k or 'softmax_label' in k: - continue - arg_params[k] = v - - calib_data = NDArrayIter(data=data, batch_size=data_shape[0]) - calib_data = DummyIter(calib_data) - qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=bn_fp32, - arg_params=arg_params, - aux_params=bn_fp32_exe.aux_dict, - ctx=mx.current_context(), - quantized_dtype=qdtype, - quantize_mode='full', - calib_mode='naive', - calib_data=calib_data, - num_calib_examples=20) - - mod = mx.mod.Module(symbol=qsym, label_names=None, context=mx.current_context()) - mod.bind(for_training=False, data_shapes=[('data', data_shape)]) - mod.set_params(qarg_params, qaux_params) - batch = mx.io.DataBatch([data], []) - mod.forward(batch, is_train=False) - output_int8_to_fp32 = mod.get_outputs()[0] - - assert_almost_equal(output.asnumpy(), output_int8_to_fp32.asnumpy(), rtol=1e-1, atol=8) - - for qdtype in ['int8', 'uint8']: - check_quantized_bn((32, 512, 4, 4), qdtype) - check_quantized_bn((32, 1024, 8, 8), qdtype) - check_quantized_bn((32, 3, 224, 224), qdtype) - @with_seed() def test_quantize_params(): if is_test_for_native_cpu(): @@ -870,144 +787,6 @@ def get_fp32_sym_with_multiple_outputs(length=1): out_grad=False, preserve_shape=False, use_ignore=False, name='softmax') return sym -@xfail_when_nonstandard_decimal_separator -@with_seed() -def test_quantize_model(): - def check_quantize_model(qdtype): - if is_test_for_native_cpu(): - print('skipped testing quantize_model for native cpu since it is not supported yet') - return - elif qdtype == 'int8' and is_test_for_mkldnn(): - print('skipped testing quantize_model for mkldnn cpu int8 since it is not supported yet') - return - elif 
qdtype == 'uint8' and is_test_for_gpu(): - print('skipped testing quantize_model for gpu uint8 since it is not supported yet') - return - - def check_params(params, qparams, qsym=None): - if qsym is None: - assert len(params) == len(qparams) - for k, v in params.items(): - assert k in qparams - assert same(v.asnumpy(), qparams[k].asnumpy()) - else: - qparams_ground_truth = mx.contrib.quant._quantize_params(qsym, params, th_dict = {}) - assert len(qparams) == len(qparams_ground_truth) - for k, v in qparams_ground_truth.items(): - assert k in qparams - assert same(v.asnumpy(), qparams[k].asnumpy()) - - def check_qsym_calibrated(qsym): - attrs = qsym.attr_dict() - for k, v in attrs.items(): - if k.find('requantize_') != -1: - assert 'min_calib_range' in v - assert 'max_calib_range' in v - - def check_qsym_qdtype(qsym, qdtype): - attrs = qsym.attr_dict() - for k, v in attrs.items(): - if k.find('_quantize') != -1: - assert 'out_type' in v - assert v['out_type'] == qdtype - - sym = get_fp32_sym() - batch_size = 4 - label_shape = (batch_size, 10) - data_shape = (batch_size, 4, 10, 10) - - length = batch_size # specify num of outputs from split op - msym = get_fp32_sym_with_multiple_outputs(length) - msym_label_shape = (length, 10) - msym_data_shape = (length, 4, 4, 10, 10) - - for s, dshape, lshape in zip((sym, msym), (data_shape, msym_data_shape), - (label_shape, msym_label_shape)): - mod = Module(symbol=s) - mod.bind(data_shapes=[('data', dshape)], label_shapes=[('softmax_label', lshape)]) - mod.init_params() - arg_params, aux_params = mod.get_params() - qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=s, - arg_params=arg_params, - aux_params=aux_params, - ctx=mx.current_context(), - quantized_dtype=qdtype, - calib_mode='none', - quantize_mode='full') - check_params(arg_params, qarg_params, qsym) - check_params(aux_params, qaux_params) - - calib_data = mx.nd.random.uniform(shape=dshape) - calib_data = NDArrayIter(data=calib_data, batch_size=batch_size) - calib_data = DummyIter(calib_data) - qsym, qarg_params, qaux_params = mx.contrib.quant.quantize_model(sym=s, - arg_params=arg_params, - aux_params=aux_params, - ctx=mx.current_context(), - quantized_dtype=qdtype, - calib_mode='naive', - calib_data=calib_data, - num_calib_examples=20, - quantize_mode='full') - check_params(arg_params, qarg_params, qsym) - check_params(aux_params, qaux_params) - check_qsym_calibrated(qsym) - check_qsym_qdtype(qsym, qdtype) - - for qdtype in ['int8', 'uint8']: - check_quantize_model(qdtype) - - -@with_seed() -def test_quantize_gluon_with_forward(): - def check_quantize_net(qdtype): - if is_test_for_native_cpu(): - print('skipped testing test_quantize_model_with_forward for native cpu since it is not supported yet') - return - elif is_test_for_gpu(): - print('skipped testing test_quantize_model_with_forward for gpu uint8 since it is not supported yet') - return - - data_shape = (32, 3, 224, 224) - data_shapes = [mx.io.DataDesc(name='data', shape=data_shape)] - label_shape = (32, 1) - batch_size = 1 - resnet18_v1 = vision.resnet18_v1(pretrained=True) - resnet18_v1.collect_params().reset_ctx(mx.current_context()) - excluded_names_match = [] - if mx.current_context() == mx.gpu(): - excluded_names_match += ['activation', 'relu', 'conv0'] - num_calib_examples = 5 - - random_data = mx.random.uniform(shape=data_shape) - random_label = mx.random.uniform(shape=label_shape) - dataset = mx.gluon.data.dataset.ArrayDataset(random_data, random_label) - calib_data = mx.gluon.data.DataLoader(dataset, 
batch_size=batch_size) - - quantized_resnet18_v1 = mx.contrib.quant.quantize_net(resnet18_v1, quantized_dtype=qdtype, - exclude_layers=None, - exclude_layers_match=excluded_names_match, - calib_mode='none', - data_shapes=data_shapes, - ctx=mx.current_context()) - quantized_resnet18_v1.hybridize(static_alloc=True, static_shape=True) - quantized_resnet18_v1(random_data) - - for mode in ['naive', 'entropy']: - qdtype = qdtype if mode is 'naive' else 'auto' - quantized_resnet18_v1 = mx.contrib.quant.quantize_net(resnet18_v1, quantized_dtype=qdtype, - exclude_layers=None, - exclude_layers_match=excluded_names_match, - calib_data=calib_data, - calib_mode=mode, - num_calib_examples=num_calib_examples, - ctx=mx.current_context()) - quantized_resnet18_v1.hybridize(static_alloc=True, static_shape=True) - quantized_resnet18_v1(random_data) - - for qdtype in ['int8', 'uint8']: - check_quantize_net(qdtype) - @xfail_when_nonstandard_decimal_separator @with_seed() def test_quantize_sym_with_calib(): @@ -1091,4 +870,3 @@ def get_threshold(nd): th_dict = mx.contrib.quant._get_optimal_thresholds(hist_dict, dtype) assert 'layer1' in th_dict assert_almost_equal(np.array([th_dict['layer1'][1]]), expected_threshold, rtol=1e-2, atol=1e-4) - diff --git a/tests/python/tensorrt/lenet5_train.py b/tests/python/tensorrt/lenet5_train.py deleted file mode 100644 index 5603180e1347..000000000000 --- a/tests/python/tensorrt/lenet5_train.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
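For reference, the Gluon path that supersedes the removed Module-based quantization tests is the one already exercised by the deleted test_quantize_gluon_with_forward just above: quantize a HybridBlock directly with mx.contrib.quant.quantize_net. The sketch below only mirrors that deleted test; the resnet18_v1 model, the data shape, and calib_mode='none' are illustrative choices rather than requirements.

import mxnet as mx
from mxnet.gluon.model_zoo import vision

data_shape = (32, 3, 224, 224)
net = vision.resnet18_v1(pretrained=True)
net.collect_params().reset_ctx(mx.cpu())

# calib_mode='none' quantizes without calibration; pass calib_data together with
# calib_mode='naive' or 'entropy' (as the removed test also did) to calibrate ranges.
qnet = mx.contrib.quant.quantize_net(net, quantized_dtype='int8',
                                     exclude_layers=None,
                                     exclude_layers_match=[],
                                     calib_mode='none',
                                     data_shapes=[mx.io.DataDesc(name='data', shape=data_shape)],
                                     ctx=mx.cpu())
qnet.hybridize(static_alloc=True, static_shape=True)
out = qnet(mx.random.uniform(shape=data_shape))
out.wait_to_read()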
- -import os -import mxnet as mx -import numpy as np -from tempfile import TemporaryDirectory - -def get_iters(mnist, batch_size): - """Get MNIST iterators.""" - train_iter = mx.io.NDArrayIter(mnist['train_data'], - mnist['train_label'], - batch_size, - shuffle=True) - val_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) - test_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) - all_test_labels = np.array(mnist['test_label']) - return train_iter, val_iter, test_iter, all_test_labels - -def lenet5(): - """LeNet-5 Symbol""" - #pylint: disable=no-member - data = mx.sym.Variable('data') - data = mx.sym.Cast(data, 'float16') - conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20) - tanh1 = mx.sym.Activation(data=conv1, act_type="tanh") - pool1 = mx.sym.Pooling(data=tanh1, pool_type="max", - kernel=(2, 2), stride=(2, 2)) - # second conv - conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50) - tanh2 = mx.sym.Activation(data=conv2, act_type="tanh") - pool2 = mx.sym.Pooling(data=tanh2, pool_type="max", - kernel=(2, 2), stride=(2, 2)) - # first fullc - flatten = mx.sym.Flatten(data=pool2) - fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=500) - tanh3 = mx.sym.Activation(data=fc1, act_type="tanh") - # second fullc - fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10) - fc2 = mx.sym.Cast(fc2, 'float32') - # loss - lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax') - #pylint: enable=no-member - return lenet - - -def train_lenet5(num_epochs, batch_size, train_iter, val_iter, test_iter): - """train LeNet-5 model on MNIST data""" - ctx = mx.gpu(0) - lenet_model = mx.mod.Module(lenet5(), context=ctx) - - lenet_model.fit(train_iter, - eval_data=val_iter, - optimizer='sgd', - optimizer_params={'learning_rate': 0.1, 'momentum': 0.9}, - eval_metric='acc', - batch_end_callback=mx.callback.Speedometer(batch_size, 1), - num_epoch=num_epochs) - - # predict accuracy for lenet - acc = mx.gluon.metric.Accuracy() - lenet_model.score(test_iter, acc) - accuracy = acc.get()[1] - assert accuracy > 0.95, "LeNet-5 training accuracy on MNIST was too low" - return lenet_model - - -if __name__ == '__main__': - num_epochs = 10 - batch_size = 128 - model_name = 'lenet5' - model_dir = os.getenv("LENET_MODEL_DIR", "/tmp") - model_file = '%s/%s-symbol.json' % (model_dir, model_name) - params_file = '%s/%s-%04d.params' % (model_dir, model_name, num_epochs) - - if not (os.path.exists(model_file) and os.path.exists(params_file)): - with TemporaryDirectory() as path: - mnist = mx.test_utils.get_mnist(path) - - _, _, _, all_test_labels = get_iters(mnist, batch_size) - - trained_lenet = train_lenet5(num_epochs, batch_size, - *get_iters(mnist, batch_size)[:-1]) - trained_lenet.save_checkpoint(model_name, num_epochs) diff --git a/tests/python/tensorrt/test_tensorrt_lenet5.py b/tests/python/tensorrt/test_tensorrt_lenet5.py deleted file mode 100644 index a37ddc31bc67..000000000000 --- a/tests/python/tensorrt/test_tensorrt_lenet5.py +++ /dev/null @@ -1,119 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import os -import numpy as np -import mxnet as mx -from ctypes.util import find_library - -def check_tensorrt_installation(): - assert find_library('nvinfer') is not None, "Can't find the TensorRT shared library" - -def get_iters(mnist, batch_size): - """Get MNIST iterators.""" - train_iter = mx.io.NDArrayIter(mnist['train_data'], - mnist['train_label'], - batch_size, - shuffle=True) - val_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) - test_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) - all_test_labels = np.array(mnist['test_label']) - return train_iter, val_iter, test_iter, all_test_labels - -def run_inference(sym, arg_params, aux_params, mnist, all_test_labels, batch_size, use_tensorrt): - """Run inference with either MXNet or TensorRT""" - - data_size = (batch_size,) + mnist['test_data'].shape[1:] - type_dict = {'data': 'float32', 'softmax_label': 'float32'} - - if use_tensorrt: - _sym = sym.get_backend_symbol('TensorRT') - arg_params, aux_params = mx.contrib.tensorrt.init_tensorrt_params(_sym, arg_params, - aux_params) - else: - _sym = sym - for k, v in arg_params.items(): - type_dict[k] = v.dtype - for k, v in aux_params.items(): - type_dict[k] = v.dtype - executor = _sym.simple_bind(ctx=mx.gpu(0), - type_dict=type_dict, - data=data_size, - softmax_label=(batch_size,), - grad_req='null', - force_rebind=True) - executor.copy_params_from(arg_params, aux_params) - - # Get this value from all_test_labels - # Also get classes from the dataset - num_ex = 10000 - all_preds = np.zeros([num_ex, 10]) - test_iter = mx.io.NDArrayIter(mnist['test_data'], mnist['test_label'], batch_size) - - example_ct = 0 - - for idx, dbatch in enumerate(test_iter): - executor.arg_dict["data"][:] = dbatch.data[0] - executor.forward(is_train=False) - offset = idx*batch_size - extent = batch_size if num_ex - offset > batch_size else num_ex - offset - all_preds[offset:offset+extent, :] = executor.outputs[0].asnumpy()[:extent] - example_ct += extent - - all_preds = np.argmax(all_preds, axis=1) - matches = (all_preds[:example_ct] == all_test_labels[:example_ct]).sum() - - percentage = 100.0 * matches / example_ct - - return percentage - - -def test_tensorrt_inference(tmpdir): - """Run LeNet-5 inference comparison between MXNet and TensorRT.""" - check_tensorrt_installation() - path = str(tmpdir) - mnist = mx.test_utils.get_mnist(path) - num_epochs = 10 - batch_size = 128 - model_name = 'lenet5' - model_dir = os.getenv("LENET_MODEL_DIR", "/tmp") - model_file = '%s/%s-symbol.json' % (model_dir, model_name) - params_file = '%s/%s-%04d.params' % (model_dir, model_name, num_epochs) - - _, _, _, all_test_labels = get_iters(mnist, batch_size) - - # Load serialized MXNet model (model-symbol.json + model-epoch.params) - sym, arg_params, aux_params = mx.model.load_checkpoint(model_name, num_epochs) - - print("LeNet-5 test") - print("Running inference in MXNet") - mx_pct = run_inference(sym, arg_params, aux_params, mnist, all_test_labels, - batch_size=batch_size, use_tensorrt=False) - - print("Running inference in MXNet-TensorRT") - trt_pct = 
run_inference(sym, arg_params, aux_params, mnist, all_test_labels, - batch_size=batch_size, use_tensorrt=True) - - print("MXNet accuracy: %f" % mx_pct) - print("MXNet-TensorRT accuracy: %f" % trt_pct) - - absolute_accuracy_diff = abs(mx_pct - trt_pct) - epsilon = 3e-2 - assert absolute_accuracy_diff < epsilon, \ - """Absolute diff. between MXNet & TensorRT accuracy (%f) exceeds threshold (%f): - MXNet = %f, TensorRT = %f""" % (absolute_accuracy_diff, epsilon, mx_pct, trt_pct) - diff --git a/tests/python/train/test_dtype.py b/tests/python/train/test_dtype.py deleted file mode 100644 index c1b5f5429893..000000000000 --- a/tests/python/train/test_dtype.py +++ /dev/null @@ -1,263 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -import numpy as np -import os, pickle, gzip -import logging -from mxnet.test_utils import get_cifar10 - -import pytest - - -@pytest.mark.garbage_expected -def test_cifar10(tmpdir): - batch_size = 128 - - # small mlp network - def get_net(): - data = mx.symbol.Variable('data') - float_data = mx.symbol.Cast(data=data, dtype="float32") - fc1 = mx.symbol.FullyConnected(float_data, name='fc1', num_hidden=128) - act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64) - act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10) - softmax = mx.symbol.SoftmaxOutput(fc3, name="softmax") - return softmax - - # check data - path = str(tmpdir) - get_cifar10(path) - - def get_iterator_uint8(kv): - data_shape = (3, 28, 28) - - train = mx.io.ImageRecordUInt8Iter( - path_imgrec = os.path.join(path, 'cifar', 'train.rec'), - data_shape = data_shape, - batch_size = batch_size, - rand_crop = True, - rand_mirror = True, - num_parts = kv.num_workers, - part_index = kv.rank) - train = mx.io.PrefetchingIter(train) - - val = mx.io.ImageRecordUInt8Iter( - path_imgrec = os.path.join(path, 'cifar', 'test.rec'), - rand_crop = False, - rand_mirror = False, - data_shape = data_shape, - batch_size = batch_size, - num_parts = kv.num_workers, - part_index = kv.rank) - - return (train, val) - - def get_iterator_uint8_with_param(kv, ctx): - data_shape = (3, 28, 28) - - train = mx.io.ImageRecordIter( - path_imgrec = os.path.join(path, 'cifar', 'train.rec'), - data_shape = data_shape, - batch_size = batch_size, - rand_crop = True, - rand_mirror = True, - num_parts = kv.num_workers, - part_index = kv.rank, - dtype ='uint8', - ctx = ctx) - train = mx.io.PrefetchingIter(train) - - val = mx.io.ImageRecordIter( - path_imgrec = os.path.join(path, 'cifar', 'test.rec'), - rand_crop = False, - rand_mirror = False, - data_shape = data_shape, - batch_size = batch_size, - num_parts = kv.num_workers, - part_index 
= kv.rank, - dtype ='uint8', - ctx = ctx) - - return (train, val) - - def get_iterator_int8(kv): - data_shape = (3, 28, 28) - - train = mx.io.ImageRecordInt8Iter( - path_imgrec = os.path.join(path, 'cifar', 'train.rec'), - data_shape = data_shape, - batch_size = batch_size, - rand_crop = True, - rand_mirror = True, - num_parts = kv.num_workers, - part_index = kv.rank) - train = mx.io.PrefetchingIter(train) - - val = mx.io.ImageRecordInt8Iter( - path_imgrec = os.path.join(path, 'cifar', 'test.rec'), - rand_crop = False, - rand_mirror = False, - data_shape = data_shape, - batch_size = batch_size, - num_parts = kv.num_workers, - part_index = kv.rank) - - return (train, val) - - def get_iterator_int8_with_param(kv, ctx): - data_shape = (3, 28, 28) - - train = mx.io.ImageRecordIter( - path_imgrec = os.path.join(path, 'cifar', 'train.rec'), - data_shape = data_shape, - batch_size = batch_size, - rand_crop = True, - rand_mirror = True, - num_parts = kv.num_workers, - part_index = kv.rank, - dtype ='int8', - ctx = ctx) - train = mx.io.PrefetchingIter(train) - - val = mx.io.ImageRecordIter( - path_imgrec = os.path.join(path, 'cifar', 'test.rec'), - rand_crop = False, - rand_mirror = False, - data_shape = data_shape, - batch_size = batch_size, - num_parts = kv.num_workers, - part_index = kv.rank, - dtype = 'int8', - ctx = ctx) - - return (train, val) - - def get_iterator_float32(kv): - data_shape = (3, 28, 28) - - train = mx.io.ImageRecordIter( - path_imgrec = os.path.join(path, 'cifar', 'train.rec'), - mean_img=os.path.join(path, 'cifar', 'mean.bin'), - data_shape = data_shape, - batch_size = batch_size, - rand_crop = True, - rand_mirror = True, - num_parts = kv.num_workers, - part_index = kv.rank) - train = mx.io.PrefetchingIter(train) - - val = mx.io.ImageRecordIter( - path_imgrec = os.path.join(path, 'cifar', 'test.rec'), - mean_img=os.path.join(path, 'cifar', 'mean.bin'), - rand_crop = False, - rand_mirror = False, - data_shape = data_shape, - batch_size = batch_size, - num_parts = kv.num_workers, - part_index = kv.rank) - - return (train, val) - - num_epoch = 1 - - def run_cifar10(train, val): - train.reset() - val.reset() - devs = [mx.cpu(0)] - net = get_net() - mod = mx.mod.Module(net, context=devs) - optim_args = {'learning_rate': 0.001, 'wd': 0.00001, 'momentum': 0.9} - eval_metrics = ['accuracy'] - executor = mx.mod.Module(net, context=devs) - executor.fit( - train, - eval_data=val, - optimizer_params=optim_args, - eval_metric=eval_metrics, - num_epoch=num_epoch, - arg_params=None, - aux_params=None, - begin_epoch=0, - batch_end_callback=mx.callback.Speedometer(batch_size, 50), - epoch_end_callback=None) - - ret = executor.score(val, eval_metrics) - ret = list(ret) - logging.info('final accuracy = %f', ret[0][1]) - assert (ret[0][1] > 0.08) - - class CustomDataIter(mx.io.DataIter): - def __init__(self, data): - super(CustomDataIter, self).__init__() - self.data = data - self.batch_size = data.provide_data[0][1][0] - - # use legacy tuple - self.provide_data = [(n, s) for n, s in data.provide_data] - self.provide_label = [(n, s) for n, s in data.provide_label] - - def reset(self): - self.data.reset() - - def next(self): - return self.data.next() - - def iter_next(self): - return self.data.iter_next() - - def getdata(self): - return self.data.getdata() - - def getlabel(self): - return self.data.getlable() - - def getindex(self): - return self.data.getindex() - - def getpad(self): - return self.data.getpad() - - # print logging by default - logging.basicConfig(level=logging.DEBUG) - console = 
logging.StreamHandler() - console.setLevel(logging.DEBUG) - logging.getLogger('').addHandler(console) - kv = mx.kvstore.create("local") - # test float32 input - (train, val) = get_iterator_float32(kv) - run_cifar10(train, val) - - # test legecay tuple in provide_data and provide_label - run_cifar10(CustomDataIter(train), CustomDataIter(val)) - - # test uint8 input - (train, val) = get_iterator_uint8(kv) - run_cifar10(train, val) - - for ctx in ("gpu", "cpu"): - (train, val) = get_iterator_uint8_with_param(kv, ctx) - run_cifar10(train, val) - - # test int8 input - (train, val) = get_iterator_int8(kv) - run_cifar10(train, val) - - for ctx in ("gpu", "cpu"): - (train, val) = get_iterator_int8_with_param(kv, ctx) - run_cifar10(train, val) diff --git a/tests/python/train/test_resnet_aug.py b/tests/python/train/test_resnet_aug.py deleted file mode 100644 index 74ba679b20c7..000000000000 --- a/tests/python/train/test_resnet_aug.py +++ /dev/null @@ -1,151 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -import numpy as np -import os, pickle, gzip -import logging -from mxnet.test_utils import get_cifar10 - -import pytest - - -@pytest.mark.garbage_expected -def test_cifar10(tmpdir): - batch_size = 128 - - # small mlp network - def get_net(): - data = mx.symbol.Variable('data') - float_data = mx.symbol.Cast(data=data, dtype="float32") - fc1 = mx.symbol.FullyConnected(float_data, name='fc1', num_hidden=128) - act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64) - act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10) - softmax = mx.symbol.SoftmaxOutput(fc3, name="softmax") - return softmax - - # check data - path = str(tmpdir) - get_cifar10(path) - - def get_iterator(kv): - data_shape = (3, 28, 28) - - train = mx.io.ImageRecordIter( - path_imgrec = os.path.join(path, 'cifar', 'train.rec'), - mean_img=os.path.join(path, 'cifar', 'mean.bin'), - data_shape = data_shape, - batch_size = batch_size, - random_resized_crop = True, - min_aspect_ratio = 0.75, - max_aspect_ratio = 1.33, - min_random_area = 0.08, - max_random_area = 1, - brightness = 0.4, - contrast = 0.4, - saturation = 0.4, - pca_noise = 0.1, - rand_mirror = True, - num_parts = kv.num_workers, - part_index = kv.rank) - train = mx.io.PrefetchingIter(train) - - val = mx.io.ImageRecordIter( - path_imgrec = os.path.join(path, 'cifar', 'test.rec'), - mean_img=os.path.join(path, 'cifar', 'mean.bin'), - rand_crop = False, - rand_mirror = False, - data_shape = data_shape, - batch_size = batch_size, - num_parts = kv.num_workers, - part_index = kv.rank) - - return (train, val) - - num_epoch = 1 - - def run_cifar10(train, val): - 
train.reset() - val.reset() - devs = [mx.cpu(0)] - net = get_net() - mod = mx.mod.Module(net, context=devs) - optim_args = {'learning_rate': 0.001, 'wd': 0.00001, 'momentum': 0.9} - eval_metrics = ['accuracy'] - executor = mx.mod.Module(net, context=devs) - executor.fit( - train, - eval_data=val, - optimizer_params=optim_args, - eval_metric=eval_metrics, - num_epoch=num_epoch, - arg_params=None, - aux_params=None, - begin_epoch=0, - batch_end_callback=mx.callback.Speedometer(batch_size, 50), - epoch_end_callback=None) - - ret = executor.score(val, eval_metrics) - ret = list(ret) - logging.info('final accuracy = %f', ret[0][1]) - assert (ret[0][1] > 0.08) - - class CustomDataIter(mx.io.DataIter): - def __init__(self, data): - super(CustomDataIter, self).__init__() - self.data = data - self.batch_size = data.provide_data[0][1][0] - - # use legacy tuple - self.provide_data = [(n, s) for n, s in data.provide_data] - self.provide_label = [(n, s) for n, s in data.provide_label] - - def reset(self): - self.data.reset() - - def next(self): - return self.data.next() - - def iter_next(self): - return self.data.iter_next() - - def getdata(self): - return self.data.getdata() - - def getlabel(self): - return self.data.getlable() - - def getindex(self): - return self.data.getindex() - - def getpad(self): - return self.data.getpad() - - # print logging by default - logging.basicConfig(level=logging.DEBUG) - console = logging.StreamHandler() - console.setLevel(logging.DEBUG) - logging.getLogger('').addHandler(console) - kv = mx.kvstore.create("local") - # test float32 input - (train, val) = get_iterator(kv) - run_cifar10(train, val) - - # test legecay tuple in provide_data and provide_label - run_cifar10(CustomDataIter(train), CustomDataIter(val)) diff --git a/tests/python/train/test_sparse_fm.py b/tests/python/train/test_sparse_fm.py deleted file mode 100644 index 0d52ab555b56..000000000000 --- a/tests/python/train/test_sparse_fm.py +++ /dev/null @@ -1,144 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
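The deleted train/test_dtype.py and train/test_resnet_aug.py runs above trained a small MLP through mx.mod.Module(...).fit(...). A minimal Gluon sketch of one equivalent training step, using autograd and gluon.Trainer, is shown below; the layer sizes and optimizer settings are taken from the removed code, while the synthetic batch merely stands in for the CIFAR-10 iterators.

import mxnet as mx
from mxnet import autograd, gluon

# The same small MLP the removed tests built as a Symbol, expressed as Gluon blocks.
net = gluon.nn.Sequential()
net.add(gluon.nn.Dense(128, activation='relu'),
        gluon.nn.Dense(64, activation='relu'),
        gluon.nn.Dense(10))
net.initialize(mx.init.Xavier(), ctx=mx.cpu())

loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
trainer = gluon.Trainer(net.collect_params(), 'sgd',
                        {'learning_rate': 0.001, 'wd': 0.00001, 'momentum': 0.9})
metric = mx.gluon.metric.Accuracy()

# One synthetic batch in place of the CIFAR-10 record iterators.
data = mx.nd.random.uniform(shape=(128, 3 * 28 * 28))
label = mx.nd.random.randint(0, 10, shape=(128,)).astype('float32')

with autograd.record():
    out = net(data)
    loss = loss_fn(out, label)
loss.backward()
trainer.step(data.shape[0])
metric.update([label], [out])
print('accuracy = %f' % metric.get()[1])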
- -import mxnet as mx -import mxnet.ndarray as nd -from mxnet.test_utils import * -import numpy as np -import os -import sys -CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.insert(0, os.path.join(CURR_PATH, '../unittest')) -from common import retry - -@retry(5) -def test_factorization_machine_module(verbose=False): - """ Test factorization machine model with sparse operators """ - def check_factorization_machine_module(optimizer=None, num_epochs=None): - print("check_factorization_machine_module( {} )".format(optimizer)) - - def fm(factor_size, feature_dim, init): - x = mx.symbol.Variable("data", stype='csr') - v = mx.symbol.Variable("v", shape=(feature_dim, factor_size), - init=init, stype='row_sparse') - - w1_weight = mx.symbol.var('w1_weight', shape=(feature_dim, 1), - init=init, stype='row_sparse') - w1_bias = mx.symbol.var('w1_bias', shape=(1)) - w1 = mx.symbol.broadcast_add(mx.symbol.dot(x, w1_weight), w1_bias) - - v_s = mx.symbol._internal._square_sum(data=v, axis=1, keepdims=True) - x_s = mx.symbol.square(data=x) - bd_sum = mx.sym.dot(x_s, v_s) - - w2 = mx.symbol.dot(x, v) - w2_squared = 0.5 * mx.symbol.square(data=w2) - - w_all = mx.symbol.Concat(w1, w2_squared, dim=1) - sum1 = mx.symbol.sum(data=w_all, axis=1, keepdims=True) - sum2 = 0.5 * mx.symbol.negative(bd_sum) - model = mx.sym.elemwise_add(sum1, sum2) - - y = mx.symbol.Variable("label") - model = mx.symbol.LinearRegressionOutput(data=model, label=y) - return model - - # model - init = mx.initializer.Normal(sigma=0.01) - factor_size = 4 - feature_dim = 10000 - model = fm(factor_size, feature_dim, init) - - # data iter - num_batches = 5 - batch_size = 64 - num_samples = batch_size * num_batches - # generate some random csr data - csr_nd = rand_ndarray((num_samples, feature_dim), 'csr', 0.1) - label = mx.nd.ones((num_samples,1)) - # the alternative is to use LibSVMIter - train_iter = mx.io.NDArrayIter(data=csr_nd, - label={'label':label}, - batch_size=batch_size, - last_batch_handle='discard') - # create module - mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) - # allocate memory by given the input data and lable shapes - mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) - # initialize parameters by uniform random numbers - mod.init_params(initializer=init) - if optimizer == 'sgd': - # use Sparse SGD with learning rate 0.1 to train - sgd = mx.optimizer.SGD(momentum=0.1, clip_gradient=5.0, learning_rate=0.01, - rescale_grad=1.0/batch_size) - mod.init_optimizer(optimizer=sgd) - if num_epochs is None: - num_epochs = 10 - expected_accuracy = 0.02 - elif optimizer == 'adam': - # use Sparse Adam to train - adam = mx.optimizer.Adam(clip_gradient=5.0, learning_rate=0.0005, - rescale_grad=1.0/batch_size) - mod.init_optimizer(optimizer=adam) - if num_epochs is None: - num_epochs = 10 - expected_accuracy = 0.05 - elif optimizer == 'adagrad': - # use Sparse AdaGrad with learning rate 0.1 to train - adagrad = mx.optimizer.AdaGrad(clip_gradient=5.0, learning_rate=0.01, - rescale_grad=1.0/batch_size) - mod.init_optimizer(optimizer=adagrad) - if num_epochs is None: - num_epochs = 20 - expected_accuracy = 0.09 - else: - raise AssertionError("Unsupported optimizer type '" + optimizer + "' specified") - # use accuracy as the metric - metric = mx.gluon.metric.create('MSE') - # train 'num_epochs' epoch - for epoch in range(num_epochs): - train_iter.reset() - metric.reset() - for batch in train_iter: - mod.forward(batch, is_train=True) # compute 
predictions - mod.update_metric(metric, batch.label) # accumulate prediction accuracy - mod.backward() # compute gradients - mod.update() # update parameters - print('Epoch %d, Training %s' % (epoch, metric.get())) - if num_epochs > 1: - assert(metric.get()[1] < expected_accuracy) - - if verbose is True: - print("============ SGD ==========================") - start = time.clock() - check_factorization_machine_module('sgd') - if verbose is True: - print("Duration: {}".format(time.clock() - start)) - print("============ ADAM ==========================") - start = time.clock() - check_factorization_machine_module('adam') - if verbose is True: - print("Duration: {}".format(time.clock() - start)) - print("============ ADAGRAD ==========================") - start = time.clock() - check_factorization_machine_module('adagrad') - if verbose is True: - print("Duration: {}".format(time.clock() - start)) - -# run as a script -if __name__ == "__main__": - test_factorization_machine_module() diff --git a/tests/python/unittest/onnx/test_models.py b/tests/python/unittest/onnx/test_models.py deleted file mode 100644 index ce54a349003e..000000000000 --- a/tests/python/unittest/onnx/test_models.py +++ /dev/null @@ -1,164 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
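The ONNX model tests removed below relied on a Module-based forward_pass helper (bind, set_params, forward). A Module-free sketch of the same helper is given here; the name forward_pass_gluon, the temporary file names, and the single-input assumption are hypothetical, with SymbolBlock.imports doing the loading.

import mxnet as mx

def forward_pass_gluon(sym, arg_params, aux_params, data_names, input_data, tmpdir='.'):
    """Run one forward pass of a symbol without mx.module, via SymbolBlock.imports."""
    sym_file = '%s/sketch-symbol.json' % tmpdir
    param_file = '%s/sketch-0000.params' % tmpdir
    sym.save(sym_file)
    # Params files use the 'arg:'/'aux:' key convention expected by SymbolBlock.imports.
    save_dict = {('arg:%s' % k): v for k, v in (arg_params or {}).items()}
    save_dict.update({('aux:%s' % k): v for k, v in (aux_params or {}).items()})
    mx.nd.save(param_file, save_dict)
    net = mx.gluon.SymbolBlock.imports(sym_file, data_names,
                                       param_file if save_dict else None, ctx=mx.cpu())
    return net(mx.nd.array(input_data)).asnumpy()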
- - -# pylint: disable=too-many-locals,wrong-import-position,import-error -from __future__ import absolute_import -import sys -import os -import pytest -import logging -import tarfile -from collections import namedtuple -import numpy as np -import numpy.testing as npt -from onnx import numpy_helper -from onnx import TensorProto -from mxnet.test_utils import download -from mxnet.contrib import onnx as onnx_mxnet -import mxnet as mx - -CURR_PATH = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.insert(0, os.path.join(CURR_PATH, '../../python/unittest')) - - -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) -URLS = { - 'bvlc_googlenet': - 'https://s3.amazonaws.com/download.onnx/models/opset_8/bvlc_googlenet.tar.gz', - 'bvlc_reference_caffenet': - 'https://s3.amazonaws.com/download.onnx/models/opset_8/bvlc_reference_caffenet.tar.gz', - 'bvlc_reference_rcnn_ilsvrc13': - 'https://s3.amazonaws.com/download.onnx/models/opset_8/bvlc_reference_rcnn_ilsvrc13.tar.gz', - 'inception_v1': - 'https://s3.amazonaws.com/download.onnx/models/opset_8/inception_v1.tar.gz', - 'inception_v2': - 'https://s3.amazonaws.com/download.onnx/models/opset_8/inception_v2.tar.gz' -} - -test_model_path = "https://s3.amazonaws.com/onnx-mxnet/test_model.onnx" - -def get_test_files(name): - """Extract tar file and returns model path and input, output data""" - tar_name = download(URLS.get(name), dirname=CURR_PATH.__str__()) - # extract tar file - tar_path = os.path.join(CURR_PATH, tar_name) - tar = tarfile.open(tar_path.__str__(), "r:*") - tar.extractall(path=CURR_PATH.__str__()) - tar.close() - data_dir = os.path.join(CURR_PATH, name) - model_path = os.path.join(data_dir, 'model.onnx') - - inputs = [] - outputs = [] - # get test files - for test_file in os.listdir(data_dir): - case_dir = os.path.join(data_dir, test_file) - # skip the non-dir files - if not os.path.isdir(case_dir): - continue - input_file = os.path.join(case_dir, 'input_0.pb') - input_tensor = TensorProto() - with open(input_file, 'rb') as proto_file: - input_tensor.ParseFromString(proto_file.read()) - inputs.append(numpy_helper.to_array(input_tensor)) - - output_tensor = TensorProto() - output_file = os.path.join(case_dir, 'output_0.pb') - with open(output_file, 'rb') as proto_file: - output_tensor.ParseFromString(proto_file.read()) - outputs.append(numpy_helper.to_array(output_tensor)) - - return model_path, inputs, outputs - - -def forward_pass(sym, arg, aux, data_names, input_data): - """ Perform forward pass on given data""" - # create module - mod = mx.mod.Module(symbol=sym, data_names=data_names, context=mx.cpu(), label_names=None) - mod.bind(for_training=False, data_shapes=[(data_names[0], input_data.shape)], label_shapes=None) - mod.set_params(arg_params=arg, aux_params=aux, - allow_missing=True, allow_extra=True) - # run inference - batch = namedtuple('Batch', ['data']) - mod.forward(batch([mx.nd.array(input_data)]), is_train=False) - - return mod.get_outputs()[0].asnumpy() - - -@pytest.mark.parametrize('model_name,input_shape,output_shape', [ - ("bvlc_googlenet", (1, 3, 224, 224), (1, 1000)), - ("bvlc_reference_caffenet", (1, 3, 224, 224), (1, 1000)), - ("bvlc_reference_rcnn_ilsvrc13", (1, 3, 224, 224), (1, 200)), - ("inception_v1", (1, 3, 224, 224), (1, 1000)), - ("inception_v2", (1, 3, 224, 224), (1, 1000)) -]) -def test_import_export(model_name, input_shape, output_shape): - def get_model_results(modelpath): - symbol, args, aux = onnx_mxnet.import_model(modelpath) - - data = 
onnx_mxnet.get_model_metadata(modelpath) - data_names = [input_name[0] for input_name in data.get('input_tensor_data')] - - result = [] - for input_data, output_data in zip(inputs, outputs): - output = forward_pass(symbol, args, aux, data_names, input_data) - result.append(output) - return symbol, args, aux, result, data - - model_path, inputs, outputs = get_test_files(model_name) - logging.info("Translating " + model_name + " from ONNX model zoo to MXNet") - - sym, arg_params, aux_params, expected_result, _ = get_model_results(model_path) - - params = {} - params.update(arg_params) - params.update(aux_params) - - dir_path = os.path.dirname(model_path) - new_model_name = "exported_" + model_name + ".onnx" - onnx_file = os.path.join(dir_path, new_model_name) - - logging.info("Translating converted model from mxnet to ONNX") - converted_model_path = onnx_mxnet.export_model(sym, params, [input_shape], np.float32, onnx_file) - - sym, arg_params, aux_params, actual_result, metadata = get_model_results(converted_model_path) - - assert len(metadata) == 2 - assert metadata.get('input_tensor_data') - assert metadata.get('input_tensor_data')[0][1] == input_shape - assert metadata.get('output_tensor_data') - assert metadata.get('output_tensor_data')[0][1] == output_shape - - # verify the results - for expected, actual in zip(expected_result, actual_result): - npt.assert_equal(expected.shape, actual.shape) - npt.assert_almost_equal(expected, actual, decimal=3) - - logging.info(model_name + " conversion successful") - -def test_nodims_import(): - # Download test model without dims mentioned in params - test_model = download(test_model_path, dirname=CURR_PATH.__str__()) - input_data = np.array([0.2, 0.5]) - nd_data = mx.nd.array(input_data).expand_dims(0) - sym, arg_params, aux_params = onnx_mxnet.import_model(test_model) - model_metadata = onnx_mxnet.get_model_metadata(test_model) - input_names = [inputs[0] for inputs in model_metadata.get('input_tensor_data')] - output_data = forward_pass(sym, arg_params, aux_params, input_names, nd_data) - assert(output_data.shape == (1,1)) diff --git a/tests/python/unittest/onnx/test_node.py b/tests/python/unittest/onnx/test_node.py index 3e2786c1bacc..c4c19a47c15b 100644 --- a/tests/python/unittest/onnx/test_node.py +++ b/tests/python/unittest/onnx/test_node.py @@ -74,38 +74,6 @@ def _fix_attributes(attrs, attribute_mapping): return new_attrs -def forward_pass(sym, arg, aux, data_names, input_data): - """ Perform forward pass on given data - :param sym: Symbol - :param arg: Arg params - :param aux: Aux params - :param data_names: Input names (list) - :param input_data: Input data (list). If there is only one input, - pass it as a list. 
For example, if input is [1, 2], - pass input_data=[[1, 2]] - :return: result of forward pass - """ - data_shapes = [] - data_forward = [] - for idx in range(len(data_names)): - val = input_data[idx] - data_shapes.append((data_names[idx], np.shape(val))) - data_forward.append(mx.nd.array(val)) - # create module - mod = mx.mod.Module(symbol=sym, data_names=data_names, context=mx.cpu(), label_names=None) - mod.bind(for_training=False, data_shapes=data_shapes, label_shapes=None) - if not arg and not aux: - mod.init_params() - else: - mod.set_params(arg_params=arg, aux_params=aux, - allow_missing=True, allow_extra=True) - # run inference - batch = namedtuple('Batch', ['data']) - mod.forward(batch(data_forward), is_train=False) - - return mod.get_outputs()[0].asnumpy() - - def get_input_tensors(input_data): input_tensor = [] input_names = [] @@ -134,66 +102,6 @@ class TestNode(unittest.TestCase): Tests are dynamically added. Therefore edit test_models to add more tests. """ - def test_import_export(self): - for test in test_cases: - test_name, mxnet_op, onnx_name, inputs, attrs, mxnet_specific, fix_attrs, check_value, check_shape = test - with self.subTest(test_name): - names, input_tensors, inputsym = get_input_tensors(inputs) - if inputs: - test_op = mxnet_op(*inputsym, **attrs) - mxnet_output = forward_pass(test_op, None, None, names, inputs) - outputshape = np.shape(mxnet_output) - else: - test_op = mxnet_op(**attrs) - shape = attrs.get('shape', (1,)) - x = mx.nd.zeros(shape, dtype='float32') - xgrad = mx.nd.zeros(shape, dtype='float32') - exe = test_op.bind(ctx=mx.cpu(), args={'x': x}, args_grad={'x': xgrad}) - mxnet_output = exe.forward(is_train=False)[0].asnumpy() - outputshape = np.shape(mxnet_output) - - if mxnet_specific: - onnxmodelfile = onnx_mxnet.export_model(test_op, {}, [np.shape(ip) for ip in inputs], - np.float32, - onnx_name + ".onnx") - onnxmodel = load_model(onnxmodelfile) - else: - onnx_attrs = _fix_attributes(attrs, fix_attrs) - onnxmodel = get_onnx_graph(test_name, names, input_tensors, onnx_name, outputshape, onnx_attrs) - - bkd_rep = backend.prepare(onnxmodel, operation='export', backend='mxnet') - output = bkd_rep.run(inputs) - - if check_value: - npt.assert_almost_equal(output[0], mxnet_output) - - if check_shape: - npt.assert_equal(output[0].shape, outputshape) - - input1 = get_rnd((1, 10, 2, 3)) - ipsym = mx.sym.Variable("input1") - for test in test_scalar_ops: - if test == 'Add': - outsym = 2 + ipsym - if test == "Sub": - outsym = ipsym - 2 - if test == "rSub": - outsym = ipsym.__rsub__(2) - if test == "Mul": - outsym = 2 * ipsym - if test == "Div": - outsym = ipsym / 2 - if test == "Pow": - outsym = ipsym ** 2 - forward_op = forward_pass(outsym, None, None, ['input1'], input1) - converted_model = onnx_mxnet.export_model(outsym, {}, [np.shape(input1)], np.float32, - onnx_file_path=outsym.name + ".onnx") - - sym, arg_params, aux_params = onnx_mxnet.import_model(converted_model) - result = forward_pass(sym, arg_params, aux_params, ['input1'], input1) - - npt.assert_almost_equal(result, forward_op) - def test_imports(self): for bk in ['mxnet', 'gluon']: for test in import_test_cases: diff --git a/example/image-classification/symbols/resnet.py b/tests/python/unittest/resnet.py similarity index 100% rename from example/image-classification/symbols/resnet.py rename to tests/python/unittest/resnet.py diff --git a/tests/python/unittest/test_contrib_svrg_module.py b/tests/python/unittest/test_contrib_svrg_module.py deleted file mode 100644 index 6e9f9b5ba22b..000000000000 
--- a/tests/python/unittest/test_contrib_svrg_module.py +++ /dev/null @@ -1,307 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -import numpy as np -from common import with_seed, assertRaises -from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule -from mxnet.test_utils import * -import pytest - -def setup(): - train_data = np.random.randint(1, 5, [1000, 2]) - weights = np.array([1.0, 2.0]) - train_label = train_data.dot(weights) - - di = mx.io.NDArrayIter(train_data, train_label, batch_size=32, shuffle=True, label_name='lin_reg_label') - X = mx.sym.Variable('data') - Y = mx.symbol.Variable('lin_reg_label') - fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) - lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") - - mod = SVRGModule( - symbol=lro, - data_names=['data'], - label_names=['lin_reg_label'], update_freq=2) - mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) - mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, force_init=False, allow_extra=False) - - return di, mod - - -def test_bind_module(): - _, mod = setup() - assert mod.binded == True - assert mod._mod_aux.binded == True - - -def test_module_init(): - _, mod = setup() - assert mod._mod_aux is not None - - -def test_module_initializer(): - def regression_model(m): - x = mx.symbol.var("data", stype='csr') - v = mx.symbol.var("v", shape=(m, 1), init=mx.init.Uniform(scale=.1), - stype='row_sparse') - model = mx.symbol.dot(lhs=x, rhs=v) - y = mx.symbol.Variable("label") - model = mx.symbol.LinearRegressionOutput(data=model, label=y, name="out") - return model - - #shape of the data - n, m = 128, 100 - model = regression_model(m) - - data = mx.nd.zeros(shape=(n, m), stype='csr') - label = mx.nd.zeros((n, 1)) - iterator = mx.io.NDArrayIter(data=data, label={'label': label}, - batch_size=n, last_batch_handle='discard') - - # create module - mod = SVRGModule(symbol=model, data_names=['data'], label_names=['label'], update_freq=2) - mod.bind(data_shapes=iterator.provide_data, label_shapes=iterator.provide_label) - mod.init_params() - v = mod._arg_params['v'] - assert v.stype == 'row_sparse' - assert np.sum(v.asnumpy()) != 0 - - -def test_module_bind(): - x = mx.sym.Variable("data") - net = mx.sym.FullyConnected(x, num_hidden=1) - - mod = SVRGModule(symbol=net, data_names=['data'], label_names=None, update_freq=2) - assertRaises(TypeError, mod.bind, data_shapes=['data', mx.nd.zeros(shape=(2, 1))]) - - mod.bind(data_shapes=[('data', (2, 1))]) - assert mod.binded == True - assert mod._mod_aux.binded == True - - -@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12510") -@with_seed() -def test_module_save_load(tmpdir): -
import os - - x = mx.sym.Variable("data") - y = mx.sym.Variable("softmax_label") - net = mx.sym.FullyConnected(x, y, num_hidden=1) - - mod = SVRGModule(symbol=net, data_names=['data'], label_names=['softmax_label'], update_freq=2) - mod.bind(data_shapes=[('data', (1, 1))]) - mod.init_params() - mod.init_optimizer(optimizer='sgd', optimizer_params={'learning_rate': 0.1}) - mod.update() - - tmp = str(tmpdir) - tmp_file = os.path.join(tmp, 'svrg_test_output') - mod.save_checkpoint(tmp_file, 0, save_optimizer_states=True) - - mod2 = SVRGModule.load(tmp_file, 0, load_optimizer_states=True, data_names=('data', )) - mod2.bind(data_shapes=[('data', (1, 1))]) - mod2.init_optimizer(optimizer_params={'learning_rate': 0.1}) - assert mod._symbol.tojson() == mod2._symbol.tojson() - - # Multi-device - mod3 = SVRGModule(symbol=net, data_names=['data'], label_names=['softmax_label'], update_freq=3, - context=[mx.cpu(0), mx.cpu(1)]) - mod3.bind(data_shapes=[('data', (10, 10))]) - mod3.init_params() - mod3.init_optimizer(optimizer_params={'learning_rate': 1.0}) - mod3.update() - mod3.save_checkpoint(tmp_file, 0, save_optimizer_states=True) - - mod4 = SVRGModule.load(tmp_file, 0, load_optimizer_states=True, data_names=('data', )) - mod4.bind(data_shapes=[('data', (10, 10))]) - mod4.init_optimizer(optimizer_params={'learning_rate': 1.0}) - assert mod3._symbol.tojson() == mod4._symbol.tojson() - - -@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12510") -@with_seed() -def test_svrgmodule_reshape(): - data = mx.sym.Variable("data") - sym = mx.sym.FullyConnected(data=data, num_hidden=4, name='fc') - - dshape=(3, 4) - mod = SVRGModule(sym, data_names=["data"], label_names=None, context=[mx.cpu(0), mx.cpu(1)], update_freq=2) - mod.bind(data_shapes=[('data', dshape)]) - mod.init_params() - mod._mod_aux.init_params() - mod.init_optimizer(optimizer_params={"learning_rate": 1.0}) - - data_batch = mx.io.DataBatch(data=[mx.nd.ones(dshape)], label=None) - mod.forward(data_batch) - mod.backward([mx.nd.ones(dshape)]) - mod.update() - assert mod.get_outputs()[0].shape == dshape - - dshape = (2, 4) - mod.reshape(data_shapes=[('data', dshape)]) - mod.forward(mx.io.DataBatch(data=[mx.nd.ones(dshape)], - label=None)) - mod.backward([mx.nd.ones(dshape)]) - mod.update() - assert mod.get_outputs()[0].shape == dshape - - -@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12510") -@with_seed() -def test_update_full_grad(): - def create_network(): - train_data = np.random.randint(1, 5, [10, 2]) - weights = np.array([1.0, 2.0]) - train_label = train_data.dot(weights) - - di = mx.io.NDArrayIter(train_data, train_label, batch_size=5, shuffle=True, label_name='lin_reg_label') - X = mx.sym.Variable('data') - Y = mx.symbol.Variable('lin_reg_label') - fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) - lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") - - mod = SVRGModule( - symbol=lro, - data_names=['data'], - label_names=['lin_reg_label'], update_freq=2) - mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) - mod.init_params(initializer=mx.init.One(), allow_missing=False, force_init=False, allow_extra=False) - mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), - force_init=False) - return di, mod - - di, svrg_mod = create_network() - - # Calculates the average of full gradients over number batches - full_grads_weights = 
mx.nd.zeros(shape=svrg_mod.get_params()[0]['fc1_weight'].shape) - arg, aux = svrg_mod.get_params() - svrg_mod._mod_aux.set_params(arg_params=arg, aux_params=aux) - num_batch = 2 - - for batch in di: - svrg_mod.forward(batch) - svrg_mod.backward() - full_grads_weights = mx.nd.broadcast_add(svrg_mod._exec_group.grad_arrays[0][0], full_grads_weights, axis=0) - full_grads_weights /= num_batch - - di.reset() - svrg_mod.update_full_grads(di) - assert same(full_grads_weights, svrg_mod._param_dict[0]['fc1_weight']) - - -@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12510") -@with_seed() -def test_svrg_with_sgd(): - def create_module_with_sgd(): - train_data = np.random.randint(1, 5, [100, 2]) - weights = np.array([1.0, 2.0]) - train_label = train_data.dot(weights) - - di = mx.io.NDArrayIter(train_data, train_label, batch_size=10, shuffle=True, label_name='lin_reg_label') - X = mx.sym.Variable('data') - Y = mx.symbol.Variable('lin_reg_label') - fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) - lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") - - reg_mod = mx.mod.Module( - symbol=lro, - data_names=['data'], - label_names=['lin_reg_label']) - reg_mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) - reg_mod.init_params(initializer=mx.init.One(), allow_missing=False, force_init=False, allow_extra=False) - reg_mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01),)) - - svrg_mod = SVRGModule(symbol=lro, - data_names=['data'], - label_names=['lin_reg_label'], - update_freq=2) - svrg_mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) - svrg_mod.init_params(initializer=mx.init.One(), allow_missing=False, force_init=False, allow_extra=False) - svrg_mod.init_optimizer(kvstore='local', optimizer='sgd', optimizer_params=(('learning_rate', 0.01),)) - - return di,reg_mod, svrg_mod - - di, reg_mod, svrg_mod = create_module_with_sgd() - num_epoch = 10 - - # Use metric MSE - metrics = mx.gluon.metric.create("mse") - - # Train with SVRGModule - for e in range(num_epoch): - metrics.reset() - if e % svrg_mod.update_freq == 0: - svrg_mod.update_full_grads(di) - di.reset() - for batch in di: - svrg_mod.forward_backward(data_batch=batch) - svrg_mod.update() - svrg_mod.update_metric(metrics, batch.label) - svrg_mse = metrics.get()[1] - - # Train with SGD standard Module - di.reset() - for e in range(num_epoch): - metrics.reset() - di.reset() - for batch in di: - reg_mod.forward_backward(data_batch=batch) - reg_mod.update() - reg_mod.update_metric(metrics, batch.label) - sgd_mse = metrics.get()[1] - - assert svrg_mse < sgd_mse - - -@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12510") -@with_seed() -def test_accumulate_kvstore(): - # Test KVStore behavior when push a list of values - kv = mx.kv.create('local') - kv.init("fc1_weight", mx.nd.zeros(shape=(1, 2))) - kv.init("fc1_weight_full", mx.nd.zeros(shape=(1, 2))) - b = [mx.nd.ones(shape=(1, 2)) for i in range(4)] - a = mx.nd.zeros(shape=(1, 2)) - kv.push("fc1_weight_full", b) - kv.pull("fc1_weight_full", out=a) - assert same(a, [mx.nd.array([4, 4])]) - assert kv.num_workers == 1 - - # Test accumulate in KVStore and allocate gradients - kv_test = mx.kv.create('local') - _, svrg_mod = setup() - svrg_mod.init_optimizer(kvstore=kv_test, optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), - force_init=False) - 
svrg_mod._accumulate_kvstore("fc1_weight", b) - assert len(svrg_mod._param_dict) == svrg_mod._ctx_len - assert same(svrg_mod._param_dict[0]["fc1_weight"], b[0]) - - -@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/12510") -@with_seed() -def test_fit(): - di, mod = setup() - num_epoch = 100 - metric = mx.gluon.metric.create("mse") - mod.fit(di, eval_metric=metric, optimizer='sgd', optimizer_params=(('learning_rate', 0.025),), num_epoch=num_epoch, - kvstore='local') - - # Estimated MSE for using SGD optimizer of lr = 0.025, SVRG MSE should be smaller - estimated_mse = 1e-5 - assert metric.get()[1] < estimated_mse - diff --git a/tests/python/unittest/test_contrib_svrg_optimizer.py b/tests/python/unittest/test_contrib_svrg_optimizer.py deleted file mode 100644 index cb6fdcf1218a..000000000000 --- a/tests/python/unittest/test_contrib_svrg_optimizer.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np -import mxnet as mx -from mxnet.test_utils import same -from mxnet.contrib.svrg_optimization.svrg_module import SVRGModule -from mxnet.contrib.svrg_optimization.svrg_optimizer import _SVRGOptimizer - - -def create_network(): - - train_data = np.random.randint(1, 5, [1000, 2]) - weights = np.array([1.0, 2.0]) - train_label = train_data.dot(weights) - - batch_size = 32 - - di = mx.io.NDArrayIter(train_data, train_label, batch_size=batch_size, shuffle=True, label_name='lin_reg_label') - X = mx.sym.Variable('data') - Y = mx.symbol.Variable('lin_reg_label') - fully_connected_layer = mx.sym.FullyConnected(data=X, name='fc1', num_hidden=1) - lro = mx.sym.LinearRegressionOutput(data=fully_connected_layer, label=Y, name="lro") - - mod = SVRGModule( - symbol=lro, - data_names=['data'], - label_names=['lin_reg_label'], update_freq=2 - ) - - mod.bind(data_shapes=di.provide_data, label_shapes=di.provide_label) - mod.init_params(initializer=mx.init.Uniform(0.01), allow_missing=False, - force_init=False, allow_extra=False) - - return di, mod - - -def test_init_svrg_optimizer(): - _, mod = create_network() - - kv = mx.kv.create('local') - mod.init_optimizer(kvstore=kv, optimizer='sgd', optimizer_params=(('learning_rate', 0.01),), - force_init=False) - - assert type(mod._optimizer).__name__ == _SVRGOptimizer.__name__ - - -def test_svrg_optimizer_constructor(): - kv = mx.kv.create('local') - svrg_optimizer = _SVRGOptimizer(default_optimizer='sgd', learning_rate=-1.0) - kv.set_optimizer(svrg_optimizer) - - assert svrg_optimizer.default_opt.lr == -1.0 - - -def test_kvstore_init_aux_keys(): - param_idx2name = {0: "weight", 1: "weight_full"} - - svrg_optimizer = _SVRGOptimizer(default_optimizer='sgd', param_idx2name= param_idx2name, learning_rate=1.0) - kv = mx.kv.create('local') - 
kv.set_optimizer(svrg_optimizer) - - # Use default sgd optimizer - param_weight_init = mx.nd.array([0, 0, 0]) - param_weight_update = mx.nd.array([1, 1, 1]) - - kv.init(0, param_weight_init) - kv.push(0, param_weight_update) - kv.pull(0, param_weight_init) - - param_weight_full_init = mx.nd.array([1, 1, 1]) - param_weight_full_update = mx.nd.array([2, 2, 2]) - - # Use AssignmentOptimizer - kv.init(1, param_weight_full_init) - kv.push(1, param_weight_full_update) - kv.pull(1, param_weight_full_init) - - # updated weights using default sgd optimizer - assert same(param_weight_init.asnumpy(), np.array([-1, -1, -1])) - # updated with AssignmentOptimizer - assert same(param_weight_full_init.asnumpy(), np.array([2, 2, 2])) - diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py index 0cc5d68ab53a..5b8c7cf967e3 100644 --- a/tests/python/unittest/test_gluon.py +++ b/tests/python/unittest/test_gluon.py @@ -1188,13 +1188,6 @@ def test_export(): assert symbol_filename == 'gluon-symbol.json' assert params_filename == 'gluon-0000.params' - module = mx.mod.Module.load('gluon', 0, label_names=None, context=ctx) - module.bind(data_shapes=[('data', data.shape)]) - module.forward(mx.io.DataBatch([data], None), is_train=False) - mod_out, = module.get_outputs() - - assert_almost_equal(out.asnumpy(), mod_out.asnumpy()) - model2 = gluon.model_zoo.vision.resnet18_v1(prefix='resnet', ctx=ctx) model2.collect_params().load('gluon-0000.params', ctx) out2 = model2(data) diff --git a/tests/python/unittest/test_gluon_rnn.py b/tests/python/unittest/test_gluon_rnn.py index 00f01d33b9ea..73ac038c60c4 100644 --- a/tests/python/unittest/test_gluon_rnn.py +++ b/tests/python/unittest/test_gluon_rnn.py @@ -153,28 +153,6 @@ def test_lstmp(): check_rnn_states(fused_states, stack_states, num_layers, True) -@pytest.mark.serial -def test_lstm_forget_bias(): - forget_bias = 2.0 - stack = gluon.rnn.SequentialRNNCell() - stack.add(gluon.rnn.LSTMCell(100, i2h_bias_initializer=mx.init.LSTMBias(forget_bias), prefix='l0_')) - stack.add(gluon.rnn.LSTMCell(100, i2h_bias_initializer=mx.init.LSTMBias(forget_bias), prefix='l1_')) - - dshape = (32, 1, 200) - data = mx.sym.Variable('data') - - sym, _ = stack.unroll(1, data, merge_outputs=True) - mod = mx.mod.Module(sym, label_names=None, context=mx.cpu(0)) - mod.bind(data_shapes=[('data', dshape)], label_shapes=None) - - mod.init_params() - - bias_argument = next(x for x in sym.list_arguments() if x.endswith('i2h_bias')) - expected_bias = np.hstack([np.zeros((100,)), - forget_bias * np.ones(100, ), np.zeros((2 * 100,))]) - assert_allclose(mod.get_params()[0][bias_argument].asnumpy(), expected_bias) - - @assert_raises_cudnn_not_satisfied(min_version='5.1.10') def test_lstm_cpu_inference(): # should behave the same as lstm cell @@ -882,18 +860,6 @@ def test_rnn_unroll_variant_length(): for valid_out_state, gt_state in zip(states, ele_states): assert_allclose(valid_out_state[i:(i+1)].asnumpy(), gt_state.asnumpy(), atol=1E-4, rtol=1E-4) - # For symbolic test, we need to make sure that it can be binded and run - data = mx.sym.var('data', shape=(4, 10, 2)) - cell = gluon.rnn.RNNCell(100) - valid_length = mx.sym.var('valid_length', shape=(4,)) - outs, states = cell.unroll(length=10, inputs=data, valid_length=valid_length, - merge_outputs=True, layout='NTC') - mod = mx.mod.Module(states[0], data_names=('data', 'valid_length'), label_names=None, - context=mx.cpu()) - mod.bind(data_shapes=[('data', (4, 10, 2)), ('valid_length', (4,))], label_shapes=None) - 
mod.init_params() - mod.forward(mx.io.DataBatch([mx.random.normal(0, 1, (4, 10, 2)), mx.nd.array([3, 6, 10, 2])])) - mod.get_outputs()[0].asnumpy() def test_cell_fill_shape(): diff --git a/tests/python/unittest/test_init.py b/tests/python/unittest/test_init.py deleted file mode 100644 index 1a86c828e9d2..000000000000 --- a/tests/python/unittest/test_init.py +++ /dev/null @@ -1,97 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import json -import pytest - -import mxnet as mx -import numpy as np - - -def test_default_init(): - data = mx.sym.Variable('data') - sym = mx.sym.LeakyReLU(data=data, act_type='prelu') - mod = mx.mod.Module(sym) - mod.bind(data_shapes=[('data', (10,10))]) - mod.init_params() - assert (list(mod.get_params()[0].values())[0].asnumpy() == 0.25).all() - -def test_variable_init(): - data = mx.sym.Variable('data') - gamma = mx.sym.Variable('gamma', init=mx.init.One()) - sym = mx.sym.LeakyReLU(data=data, gamma=gamma, act_type='prelu') - mod = mx.mod.Module(sym) - mod.bind(data_shapes=[('data', (10,10))]) - mod.init_params() - assert (list(mod.get_params()[0].values())[0].asnumpy() == 1).all() - -def test_aux_init(): - data = mx.sym.Variable('data') - sym = mx.sym.BatchNorm(data=data, name='bn') - mod = mx.mod.Module(sym) - mod.bind(data_shapes=[('data', (10, 10, 3, 3))]) - mod.init_params() - assert (mod.get_params()[1]['bn_moving_var'].asnumpy() == 1).all() - assert (mod.get_params()[1]['bn_moving_mean'].asnumpy() == 0).all() - -@pytest.mark.skip(reason="rsp const init is broken: https://github.com/apache/incubator-mxnet/issues/17988") -def test_rsp_const_init(): - def check_rsp_const_init(init, val): - shape = (10, 10) - x = mx.symbol.Variable("data", stype='csr') - weight = mx.symbol.Variable("weight", shape=(shape[1], 2), - init=init, stype='row_sparse') - dot = mx.symbol.sparse.dot(x, weight) - mod = mx.mod.Module(dot, label_names=None) - mod.bind(data_shapes=[('data', shape)]) - mod.init_params() - assert (list(mod.get_params()[0].values())[0].asnumpy() == val).all() - - check_rsp_const_init(mx.initializer.Constant(value=2.), 2.) - check_rsp_const_init(mx.initializer.Zero(), 0.) - check_rsp_const_init(mx.initializer.One(), 1.) 
- -def test_bilinear_init(): - bili = mx.init.Bilinear() - bili_weight = mx.ndarray.empty((1,1,4,4)) - bili._init_weight(None, bili_weight) - bili_1d = np.array([[1/float(4), 3/float(4), 3/float(4), 1/float(4)]]) - bili_2d = bili_1d * np.transpose(bili_1d) - assert (bili_2d == bili_weight.asnumpy()).all() - -def test_const_init_dumps(): - shape = tuple(np.random.randint(1, 10, size=np.random.randint(1, 5))) - # test NDArray input - init = mx.init.Constant(mx.nd.ones(shape)) - val = init.dumps() - assert val == json.dumps([init.__class__.__name__.lower(), init._kwargs]) - # test scalar input - init = mx.init.Constant(1) - assert init.dumps() == '["constant", {"value": 1}]' - # test numpy input - init = mx.init.Constant(np.ones(shape)) - val = init.dumps() - assert val == json.dumps([init.__class__.__name__.lower(), init._kwargs]) - - -if __name__ == '__main__': - test_variable_init() - test_default_init() - test_aux_init() - test_rsp_const_init() - test_bilinear_init() - test_const_init_dumps() diff --git a/tests/python/unittest/test_loss.py b/tests/python/unittest/test_loss.py index 4856bdc6f2ee..5e6c0f798d9e 100644 --- a/tests/python/unittest/test_loss.py +++ b/tests/python/unittest/test_loss.py @@ -65,50 +65,6 @@ def get_net(num_hidden, flatten=True): fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=num_hidden, flatten=flatten) return fc3 -# tracked at: https://github.com/apache/incubator-mxnet/issues/11692 -@with_seed() -def test_ce_loss(): - nclass = 10 - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, nclass)) - label = mx.nd.array(np.random.randint(0, nclass, size=(N,)), dtype='int32') - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label') - output = get_net(nclass) - l = mx.symbol.Variable('label') - Loss = gluon.loss.SoftmaxCrossEntropyLoss() - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.gluon.metric.Loss(), optimizer='adam', - initializer=mx.init.Xavier(magnitude=2)) - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 - -# tracked at: https://github.com/apache/incubator-mxnet/issues/11691 -@with_seed() -def test_bce_loss(): - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 20)) - label = mx.nd.array(np.random.randint(2, size=(N,)), dtype='float32') - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label') - output = get_net(1) - l = mx.symbol.Variable('label') - Loss = gluon.loss.SigmoidBinaryCrossEntropyLoss() - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.gluon.metric.Loss(), optimizer='adam', - initializer=mx.init.Xavier(magnitude=2)) - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.01 - # Test against npy - data = mx.random.uniform(-5, 5, shape=(10,)) - label = mx.random.uniform(0, 1, shape=(10,)) - mx_bce_loss = Loss(data, label).asnumpy() - prob_npy = 1.0 / (1.0 + np.exp(-data.asnumpy())) - label_npy = label.asnumpy() - npy_bce_loss = - label_npy * np.log(prob_npy) - (1 - label_npy) * np.log(1 - prob_npy) - assert_almost_equal(mx_bce_loss, npy_bce_loss, rtol=1e-4, atol=1e-5) @with_seed() def test_bce_equal_ce2(): @@ -130,58 +86,6 @@ def test_logistic_loss_equal_bce(): assert_almost_equal(loss_binary(data, 
label), loss_bce(data, label), atol=1e-6) assert_almost_equal(loss_signed(data, 2 * label - 1), loss_bce(data, label), atol=1e-6) -@with_seed() -def test_kl_loss(): - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 10)) - label = mx.nd.softmax(mx.random.uniform(0, 1, shape=(N, 2))) - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label') - output = mx.sym.log_softmax(get_net(2)) - l = mx.symbol.Variable('label') - Loss = gluon.loss.KLDivLoss() - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.gluon.metric.Loss(), optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 - - -@with_seed() -def test_l2_loss(): - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 10)) - label = mx.random.uniform(-1, 1, shape=(N, 1)) - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) - output = get_net(1) - l = mx.symbol.Variable('label') - Loss = gluon.loss.L2Loss() - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), - optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 - - -@with_seed() -def test_l1_loss(): - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 10)) - label = mx.random.uniform(-1, 1, shape=(N, 1)) - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) - output = get_net(1) - l = mx.symbol.Variable('label') - Loss = gluon.loss.L1Loss() - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), - optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.1 - @with_seed() def test_ctc_loss(): @@ -210,145 +114,6 @@ def test_ctc_loss(): assert_almost_equal(l, np.array([18.82820702, 16.50581741])) -@with_seed() -def test_ctc_loss_train(): - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 20, 10)) - label = mx.nd.arange(4, repeat=N).reshape((N, 4)) - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) - output = get_net(5, False) - l = mx.symbol.Variable('label') - Loss = gluon.loss.CTCLoss(layout='NTC', label_layout='NT') - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), - optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 10 - - -@with_seed() -def test_sample_weight_loss(): - nclass = 10 - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, nclass)) - label = mx.nd.array(np.random.randint(0, nclass, size=(N,)), dtype='int32') - weight = mx.nd.array([1 for i in range(10)] + [0 for i in range(10)]) - data_iter = mx.io.NDArrayIter(data, {'label': label, 'w': weight}, batch_size=10) - output = get_net(nclass) - l = 
mx.symbol.Variable('label') - w = mx.symbol.Variable('w') - Loss = gluon.loss.SoftmaxCrossEntropyLoss() - loss = Loss(output, l, w) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label', 'w')) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.gluon.metric.Loss(), optimizer='adam') - data_iter = mx.io.NDArrayIter(data[10:], {'label': label, 'w': weight}, batch_size=10) - score = mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] - assert score > 1 - data_iter = mx.io.NDArrayIter(data[:10], {'label': label, 'w': weight}, batch_size=10) - score = mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] - assert score < 0.05 - - -@with_seed(1234) -def test_saveload(): - nclass = 10 - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, nclass)) - label = mx.nd.array(np.random.randint(0, nclass, size=(N,)), dtype='int32') - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label') - output = get_net(nclass) - l = mx.symbol.Variable('label') - Loss = gluon.loss.SoftmaxCrossEntropyLoss() - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=100, optimizer_params={'learning_rate': 1.}, - eval_metric=mx.gluon.metric.Loss()) - mod.save_checkpoint('test', 100, save_optimizer_states=True) - mod = mx.mod.Module.load('test', 100, load_optimizer_states=True, - data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=100, optimizer_params={'learning_rate': 1.}, - eval_metric=mx.gluon.metric.Loss()) - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 - -@with_seed() -def test_huber_loss(): - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 10)) - label = mx.random.uniform(-1, 1, shape=(N, 1)) - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) - output = get_net(1) - l = mx.symbol.Variable('label') - Loss = gluon.loss.HuberLoss() - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), - optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 - - -@with_seed() -def test_hinge_loss(): - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 10)) - label = mx.nd.sign(mx.random.uniform(-1, 1, shape=(N, 1))) - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) - output = get_net(1) - l = mx.symbol.Variable('label') - Loss = gluon.loss.HingeLoss() - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), - optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.06 - - -@with_seed() -def test_squared_hinge_loss(): - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 10)) - label = mx.nd.sign(mx.random.uniform(-1, 1, shape=(N, 1))) - data_iter = mx.io.NDArrayIter(data, label, batch_size=10, label_name='label', shuffle=True) - output = get_net(1) - l = mx.symbol.Variable('label') - Loss = gluon.loss.SquaredHingeLoss() - loss 
= Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), - optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 - - -@with_seed() -def test_triplet_loss(): - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 10)) - pos = mx.random.uniform(-1, 1, shape=(N, 10)) - neg = mx.random.uniform(-1, 1, shape=(N, 10)) - data_iter = mx.io.NDArrayIter(data, {'pos': pos, 'neg': neg}, batch_size=10, - label_name='label', shuffle=True) - output = get_net(10) - pos = mx.symbol.Variable('pos') - neg = mx.symbol.Variable('neg') - Loss = gluon.loss.TripletLoss() - loss = Loss(output, pos, neg) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('pos','neg')) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.gluon.metric.Loss(), - optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 - @xfail_when_nonstandard_decimal_separator @with_seed() def test_sdml_loss(): @@ -443,51 +208,3 @@ def test_poisson_nllloss(): loss_compute_full = Loss_compute_full(mx.nd.array(np_pred), mx.nd.array(np_target)) assert_almost_equal(np_compute_full, loss_compute_full.asscalar()) -@with_seed() -def test_poisson_nllloss_mod(): - N = 1000 - data = mx.random.poisson(shape=(N, 2)) - label = mx.random.poisson(lam=4, shape=(N, 1)) - data_iter = mx.io.NDArrayIter(data, label, batch_size=20, label_name='label', shuffle=True) - output = mx.sym.exp(get_net(1)) - l = mx.symbol.Variable('label') - Loss = gluon.loss.PoissonNLLLoss(from_logits=False) - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=20, optimizer_params={'learning_rate': 0.01}, - initializer=mx.init.Normal(sigma=0.1), eval_metric=mx.gluon.metric.Loss(), - optimizer='adam') - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.05 - -@with_seed() -def test_bce_loss_with_pos_weight(): - # Suppose it's a multi-label classification - N = np.random.randint(5, 30) - data = mx.nd.random.uniform(-1, 1, shape=(N, 20)) - label = mx.nd.array(np.random.randint(2, size=(N, 5)), dtype='float32') - pos_weight = mx.nd.random.uniform(0, 10, shape=(1, 5)) - pos_weight = mx.nd.repeat(pos_weight, repeats=N, axis=0) - data_iter = mx.io.NDArrayIter(data, {'label': label, 'pos_w': pos_weight}, batch_size=10, label_name='label') - output = get_net(5) - l = mx.symbol.Variable('label') - pos_w = mx.symbol.Variable('pos_w') - Loss = gluon.loss.SigmoidBinaryCrossEntropyLoss() - loss = Loss(output, l, None, pos_w) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label', 'pos_w')) - mod.fit(data_iter, num_epoch=200, optimizer_params={'learning_rate': 0.01}, - eval_metric=mx.gluon.metric.Loss(), optimizer='adam', - initializer=mx.init.Xavier(magnitude=2)) - assert mod.score(data_iter, eval_metric=mx.gluon.metric.Loss())[0][1] < 0.01 - # Test against npy - data = mx.nd.random.uniform(-5, 5, shape=(N, 5)) - label = mx.nd.array(np.random.randint(2, size=(N, 5)), dtype='float32') - pos_weight = mx.nd.random.uniform(0, 10, shape=(1, 5)) - mx_bce_loss = Loss(data, label, None, pos_weight).asnumpy() - 
prob_npy = 1.0 / (1.0 + np.exp(-data.asnumpy())) - label_npy = label.asnumpy() - pos_weight_npy = pos_weight.asnumpy() - npy_bce_loss = (- label_npy * np.log(prob_npy)*pos_weight_npy - (1 - label_npy) * np.log(1 - prob_npy)).mean(axis=1) - assert_almost_equal(mx_bce_loss, npy_bce_loss, rtol=1e-4, atol=1e-5) - diff --git a/tests/python/unittest/test_memory_opt.py b/tests/python/unittest/test_memory_opt.py index d31eee0d4a99..ae4bd3bf1e9a 100644 --- a/tests/python/unittest/test_memory_opt.py +++ b/tests/python/unittest/test_memory_opt.py @@ -172,15 +172,10 @@ def test_resnet152(): # Verify the memory allocation behavior on ResNet-152, the state-of-the-art # model used for image classification. - # Import the network, similar to what we did in - # ${MXNET_ROOT_DIR}/example/image-classification/train_imagenet.py - from importlib import import_module - sys.path.append(os.path.join(os.path.dirname(__file__), - '..', '..', '..', 'example', 'image-classification')) - resnet_mod = import_module('symbols.resnet') - resnet_152 = resnet_mod.get_symbol(num_classes=1000, - num_layers=152, - image_shape='3,224,224') + import resnet + resnet_152 = resnet.get_symbol(num_classes=1000, + num_layers=152, + image_shape='3,224,224') # We do the binding twice, one with the memory optimizations and one without. # It is expected that the memory consumption of the former should be roughly # half of that of the latter. diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py deleted file mode 100644 index b7f12ff3aee1..000000000000 --- a/tests/python/unittest/test_module.py +++ /dev/null @@ -1,740 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import os -import mxnet as mx -import mxnet.ndarray as nd -from mxnet.test_utils import * -import numpy as np -from functools import reduce -from mxnet.module.executor_group import DataParallelExecutorGroup -from common import setup_module, with_seed, assertRaises, teardown_module -from collections import namedtuple -curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) -sys.path.insert(0, os.path.join(curr_path, "../train")) -import pytest - - -@with_seed() -def test_module_dtype(): - dtype = np.float16 - dshape = (3, 8, 7) - - sym = mx.sym.Variable('data') - sym = mx.sym.Activation(data=sym, act_type='relu', __layout__='TNC') - - mod = mx.mod.Module(sym, ('data',), None, context=[mx.cpu(0), mx.cpu(1)]) - mod.bind(data_shapes=[mx.io.DataDesc('data', dshape, dtype, layout='TNC')]) - mod.init_params() - mod.forward(mx.io.DataBatch(data=[mx.nd.ones(dshape, dtype=dtype)], - label=None)) - mod.backward([mx.nd.ones(dshape, dtype=dtype)]) - - for x in mod.get_outputs(): - assert x.dtype == dtype - - -def test_module_bind(): - sym = mx.sym.Variable('data') - sym = mx.sym.Activation(data=sym, act_type='relu', __layout__='TNC') - - mod = mx.mod.Module(sym, ('data',), None, context=[mx.cpu(0), mx.cpu(1)]) - assertRaises(TypeError, mod.bind, data_shapes=[('data', mx.nd.array([10,10]))]) - assert mod.binded == False - - mod.bind(data_shapes=[('data', (10,10))]) - assert mod.binded == True - - -@with_seed() -def test_module_input_grads(): - a = mx.sym.Variable('a', __layout__='NC') - b = mx.sym.Variable('b', __layout__='NC') - c = mx.sym.Variable('c', __layout__='NC') - - c = a + 2 * b + 3 * c - net = mx.mod.Module(c, data_names=['b', 'c', 'a'], label_names=None, - context=[mx.cpu(0), mx.cpu(1)]) - net.bind(data_shapes=[['b', (5, 5)], ['c', (5, 5)], ['a', (5, 5)]], - label_shapes=None, inputs_need_grad=True) - net.init_params() - - net.forward(data_batch=mx.io.DataBatch(data=[nd.ones((5, 5)), - nd.ones((5, 5)), - nd.ones((5, 5))])) - net.backward(out_grads=[nd.ones((5, 5))]) - input_grads = net.get_input_grads() - b_grad = input_grads[0].asnumpy() - c_grad = input_grads[1].asnumpy() - a_grad = input_grads[2].asnumpy() - assert np.all(a_grad == 1), a_grad - assert np.all(b_grad == 2), b_grad - assert np.all(c_grad == 3), c_grad - - -@with_seed() -def test_module_ctx_group(): - def check_module_ctx_group(ctxs, group2ctxs, grad_ctxs=None): - with mx.AttrScope(ctx_group='dev1'): - a = mx.symbol.Variable('a') - a = a * 2 - with mx.AttrScope(ctx_group='dev2'): - b = mx.symbol.Variable('b') - c = a + b - shape = (2, 5) - mod1 = mx.mod.Module(c, context=ctxs, data_names=['a', 'b'], label_names=None, - group2ctxs=group2ctxs) - mod1.bind(data_shapes=[['a', shape], ['b', shape]], inputs_need_grad=True) - mod1.init_params() - mod1.forward(data_batch=mx.io.DataBatch(data=[mx.nd.ones(shape), mx.nd.ones(shape)]), is_train=True) - mod1.backward([mx.nd.ones(shape)]) - mod1_input_grads = mod1.get_input_grads() - - mod2 = mx.mod.Module(c, context=ctxs, data_names=['a', 'b'], label_names=None) - mod2.bind(data_shapes=[['a', shape], ['b', shape]], inputs_need_grad=True) - mod2.init_params() - mod2.forward(data_batch=mx.io.DataBatch(data=[mx.nd.ones(shape), mx.nd.ones(shape)]), is_train=True) - mod2.backward([mx.nd.ones(shape)]) - mod2_input_grads = mod2.get_input_grads() - - if grad_ctxs is not None: - assert(mod1_input_grads[0].context == grad_ctxs[0]) - assert(mod1_input_grads[1].context == grad_ctxs[1]) - assert(np.all(mod1_input_grads[0].asnumpy() == mod2_input_grads[0].asnumpy())) - 
assert(np.all(mod1_input_grads[1].asnumpy() == mod2_input_grads[1].asnumpy())) - - check_module_ctx_group([mx.cpu(0)], {'dev1': mx.cpu(1), 'dev2': mx.cpu(2)}, grad_ctxs=[mx.cpu(1), mx.cpu(2)]) - check_module_ctx_group([mx.cpu(0), mx.cpu(1)], - [{'dev1': mx.cpu(2), 'dev2': mx.cpu(3)}, {'dev1': mx.cpu(4), 'dev2': mx.cpu(5)}]) - check_module_ctx_group([mx.cpu(0), mx.cpu(1)], {'dev1': mx.cpu(2), 'dev2': mx.cpu(3)}) - check_module_ctx_group([mx.cpu(0), mx.cpu(1)], {'dev1': mx.cpu(2), 'dev2': [mx.cpu(3)]}) - check_module_ctx_group([mx.cpu(0), mx.cpu(1)], {'dev1':mx.cpu(2), 'dev2':[mx.cpu(3), mx.cpu(3)]}) - check_module_ctx_group([mx.cpu(0), mx.cpu(1)], - {'dev1':[mx.cpu(2), mx.cpu(2)], 'dev2':[mx.cpu(3), mx.cpu(3)]}) - -@with_seed() -def test_bucket_module_ctx_group(): - num_hidden = 10 - batch_size = 5 - def sym_gen(seq_len): - with mx.AttrScope(ctx_group='dev1'): - data = mx.symbol.Variable('data') - weight = mx.symbol.Variable('dev1_weight') - bias = mx.symbol.Variable('dev1_bias') - fc = data - for i in range(seq_len): - fc = mx.symbol.FullyConnected(data=fc, weight=weight, bias=bias, - name='dev1_fc_%d' % i, num_hidden=num_hidden) - with mx.AttrScope(ctx_group='dev2'): - label = mx.symbol.Variable('label') - weight = mx.symbol.Variable('dev2_weight') - bias = mx.symbol.Variable('dev2_bias') - for i in range(seq_len): - fc = mx.symbol.FullyConnected(data=fc, weight=weight, bias=bias, - name='dev2_fc_%d' % i, num_hidden=num_hidden) - sym = mx.symbol.SoftmaxOutput(fc, label, name='softmax') - - return sym, ('data',), ('label',) - - mod = mx.mod.BucketingModule(sym_gen=sym_gen, default_bucket_key=10, context=[mx.cpu(0)], - group2ctxs=[{'dev1': mx.cpu(1), 'dev2': mx.cpu(2)}]) - mod.bind(data_shapes=[['data', (batch_size, num_hidden)]], - label_shapes=[['label', (batch_size,)]], - for_training=True, inputs_need_grad=True) - assert(mod.binded) - -@with_seed() -def test_module_layout(): - sym = mx.sym.Variable('data') - sym = mx.sym.Activation(data=sym, act_type='relu', __layout__='TNC') - - dshape = (3, 8, 7) - mod = mx.mod.Module(sym, ('data',), None, context=[mx.cpu(0), mx.cpu(1)]) - mod.bind(data_shapes=[mx.io.DataDesc('data', dshape, layout='TNC')]) - mod.init_params() - mod.forward(mx.io.DataBatch(data=[mx.nd.ones(dshape)], - label=None)) - mod.backward([mx.nd.ones(dshape)]) - assert mod.get_outputs()[0].shape == dshape - - hdshape = (3, 4, 7) - for x in mod.get_outputs(merge_multi_context=False)[0]: - assert x.shape == hdshape - - -@with_seed() -@pytest.mark.parametrize('ctx,get_updater', [ - (mx.cpu(), lambda m: m._updater), - ([mx.cpu(0), mx.cpu(1)], lambda m: m._kvstore._updater) -]) -def test_save_load(ctx, get_updater, tmpdir): - previous_update_on_kvstore = os.getenv('MXNET_UPDATE_ON_KVSTORE', "1") - os.putenv('MXNET_UPDATE_ON_KVSTORE', '1') - def dict_equ(a, b): - assert set(a) == set(b) - for k in a: - assert (a[k].asnumpy() == b[k].asnumpy()).all() - - sym = mx.sym.Variable('data') - sym = mx.sym.FullyConnected(sym, num_hidden=100) - - path = str(tmpdir.join('test')) - mod = mx.mod.Module(sym, ('data',), context=ctx) - mod.bind(data_shapes=[('data', (10, 10))]) - mod.init_params() - mod.init_optimizer(optimizer_params={'learning_rate':0.1, 'momentum':0.9}) - mod.update() - mod.save_checkpoint(path, 0, save_optimizer_states=True) - - mod2 = mx.mod.Module.load(path, 0, load_optimizer_states=True, data_names=('data',)) - mod2.bind(data_shapes=[('data', (10, 10))]) - mod2.init_optimizer(optimizer_params={'learning_rate':0.1, 'momentum':0.9}) - assert mod._symbol.tojson() == 
mod2._symbol.tojson() - dict_equ(mod.get_params()[0], mod2.get_params()[0]) - dict_equ(get_updater(mod).states, mod2._updater.states) - - os.putenv('MXNET_UPDATE_ON_KVSTORE', previous_update_on_kvstore) - - -@with_seed() -def test_module_reshape(): - data = mx.sym.Variable('data') - sym = mx.sym.FullyConnected(data, num_hidden=20, name='fc') - - dshape = (7, 20) - mod = mx.mod.Module(sym, ('data',), None, context=[mx.cpu(0), mx.cpu(1)]) - mod.bind(data_shapes=[('data', dshape)]) - mod.init_params() - mod.init_optimizer(optimizer_params={'learning_rate': 1}) - - mod.forward(mx.io.DataBatch(data=[mx.nd.ones(dshape)], - label=None)) - mod.backward([mx.nd.ones(dshape)]) - mod.update() - assert mod.get_outputs()[0].shape == dshape - assert (mod.get_params()[0]['fc_bias'].asnumpy() == -1).all() - - dshape = (14, 20) - mod.reshape(data_shapes=[('data', dshape)]) - mod.forward(mx.io.DataBatch(data=[mx.nd.ones(dshape)], - label=None)) - mod.backward([mx.nd.ones(dshape)]) - mod.update() - assert mod.get_outputs()[0].shape == dshape - assert (mod.get_params()[0]['fc_bias'].asnumpy() == -3).all() - - - -# roywei: Getting rid of fixed seed as flakiness could not be reproduced, -# tracked at: https://github.com/apache/incubator-mxnet/issues/11705 -@with_seed() -def test_module_set_params(): - # data iter - data = mx.nd.array([[0.05, .10]]); - label = mx.nd.array([[.01, 0.99]]); - train_data = mx.io.NDArrayIter(data, label, batch_size=1) - - # symbols - x = mx.symbol.Variable('data') - x = mx.symbol.FullyConnected(name='fc_0', data=x, num_hidden=2) - x = mx.symbol.Activation(name="act_0", data=x, act_type='sigmoid') - x = mx.symbol.FullyConnected(name='fc_1', data=x, num_hidden=2) - x = mx.symbol.Activation(name="act_1", data=x, act_type='sigmoid') - x = mx.symbol.LinearRegressionOutput(data=x, name='softmax', grad_scale=2) - - # create module - mod = mx.mod.Module(x, context=[mx.cpu()]); - mod.bind(train_data.provide_data, label_shapes=train_data.provide_label, - for_training=True) - - arg_params_correct = {'fc_0_weight': mx.nd.array([[.15, .20], [.25, .30]]), - 'fc_0_bias' : mx.nd.array([.35, .35]), - 'fc_1_weight': mx.nd.array([[.40, .45], [.50, .55]]), - 'fc_1_bias' : mx.nd.array([.60, .60])} - - arg_params_missing = {'fc_0_weight': mx.nd.array([[.15, .20], [.25, .30]]), - 'fc_0_bias' : mx.nd.array([.35, .35]), - 'fc_1_weight': mx.nd.array([[.40, .45], [.50, .55]])} - - arg_params_extra = {'fc_0_weight': mx.nd.array([[.15, .20], [.25, .30]]), - 'fc_0_bias' : mx.nd.array([.35, .35]), - 'fc_1_weight': mx.nd.array([[.40, .45], [.50, .55]]), - 'fc_1_bias' : mx.nd.array([.60, .60]), - 'fc_2_weight': mx.nd.array([.60, .60])} - - arg_params_missing_extra = {'fc_2_weight': mx.nd.array([.60, .60])} - - # test regular set_params - mod.set_params(force_init=True, arg_params=arg_params_correct, aux_params={}) - - # test allow missing - mod.set_params(force_init=True, arg_params=arg_params_missing, aux_params={}, allow_missing=True) - assertRaises(RuntimeError, mod.set_params, - force_init=True, arg_params=arg_params_missing, - aux_params={}, allow_missing=False) - - # test allow extra - mod.set_params(force_init=True, arg_params=arg_params_extra, aux_params={}, allow_missing=True, allow_extra=True) - assertRaises(ValueError, mod.set_params, - force_init=True, arg_params=arg_params_extra, - aux_params={}, allow_missing=True, allow_extra=False) - - # test allow missing + extra, - assertRaises(RuntimeError, mod.set_params, - force_init=True, arg_params=arg_params_missing_extra, - aux_params={}, allow_missing=False, 
allow_extra=False) - - # test allow missing + extra, this will throw a runtime error - assertRaises(ValueError, mod.set_params, - force_init=True, arg_params=arg_params_missing_extra, - aux_params={}, allow_missing=True, allow_extra=False) - - -@with_seed() -@pytest.mark.garbage_expected -def test_monitor(): - # data iter - data = mx.nd.array([[0.05, .10]]); - label = mx.nd.array([[.01, 0.99]]); - train_data = mx.io.NDArrayIter(data, label, batch_size=1) - - # symbols - x = mx.symbol.Variable('data') - x = mx.symbol.FullyConnected(name='fc_0', data=x, num_hidden=2) - x = mx.symbol.Activation(name="act_0", data=x, act_type='sigmoid') - x = mx.symbol.FullyConnected(name='fc_1', data=x, num_hidden=2) - x = mx.symbol.Activation(name="act_1", data=x, act_type='sigmoid') - x = mx.symbol.LinearRegressionOutput(data=x, name='softmax', grad_scale=2) - - # create monitor - def mean_abs(x): - sum_abs = mx.ndarray.sum(mx.ndarray.abs(x)) - return mx.ndarray.divide(sum_abs, reduce(lambda x, y: x * y, x.shape)) - mon = mx.mon.Monitor(1, stat_func=mean_abs, pattern='.*', sort=True) - - # create module - mod = mx.mod.Module(x, context=[mx.cpu()]); - mod.bind(train_data.provide_data, label_shapes=train_data.provide_label, - for_training=True) - mod.install_monitor(mon) - arg_params = {'fc_0_weight': mx.nd.array([[.15, .20], [.25, .30]]), - 'fc_0_bias' : mx.nd.array([.35, .35]), - 'fc_1_weight': mx.nd.array([[.40, .45], [.50, .55]]), - 'fc_1_bias' : mx.nd.array([.60, .60])} - mod.init_params(arg_params=arg_params) - - data_iter = iter(train_data) - data_batch = next(data_iter) - mon.tic() - mod.forward_backward(data_batch) - res = mon.toc() - keys = ['act_0', 'act_1', 'data', 'fc_0', 'fc_1', 'softmax'] - mon_result_counts = [0, 0, 0, 0, 0, 0] - assert(len(res) == 21) - for n, k, v in res: - for idx, key in enumerate(keys): - if k.startswith(key): - mon_result_counts[idx] += 1 - break - assert(mon_result_counts == [2, 2, 1, 6, 6, 4]) - - -@with_seed() -def test_factorization_machine_module(): - """ Test factorization machine model with sparse operators """ - # this unit test is to test the flow, training accuracy is tested in another test - def check_factorization_machine_module(num_epochs=None): - print("check_factorization_machine_module") - - def fm(factor_size, feature_dim, init): - x = mx.symbol.Variable("data", stype='csr') - v = mx.symbol.Variable("v", shape=(feature_dim, factor_size), - init=init, stype='row_sparse') - - w1_weight = mx.symbol.var('w1_weight', shape=(feature_dim, 1), - init=init, stype='row_sparse') - w1_bias = mx.symbol.var('w1_bias', shape=(1)) - w1 = mx.symbol.broadcast_add(mx.symbol.dot(x, w1_weight), w1_bias) - - v_s = mx.symbol._internal._square_sum(data=v, axis=1, keepdims=True) - x_s = mx.symbol.square(data=x) - bd_sum = mx.sym.dot(x_s, v_s) - - w2 = mx.symbol.dot(x, v) - w2_squared = 0.5 * mx.symbol.square(data=w2) - - w_all = mx.symbol.Concat(w1, w2_squared, dim=1) - sum1 = mx.symbol.sum(data=w_all, axis=1, keepdims=True) - sum2 = 0.5 * mx.symbol.negative(bd_sum) - model = mx.sym.elemwise_add(sum1, sum2) - - y = mx.symbol.Variable("label") - model = mx.symbol.LinearRegressionOutput(data=model, label=y) - return model - - # model - init = mx.initializer.Normal(sigma=0.01) - factor_size = 4 - feature_dim = 10000 - model = fm(factor_size, feature_dim, init) - - # data iter - num_batches = 5 - batch_size = 64 - num_samples = batch_size * num_batches - # generate some random csr data - csr_nd = rand_ndarray((num_samples, feature_dim), 'csr', 0.1) - label = 
mx.nd.ones((num_samples,1)) - # the alternative is to use LibSVMIter - train_iter = mx.io.NDArrayIter(data=csr_nd, - label={'label':label}, - batch_size=batch_size, - last_batch_handle='discard') - # create module - mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) - # allocate memory by given the input data and lable shapes - mod.bind(data_shapes=train_iter.provide_data, label_shapes=train_iter.provide_label) - # initialize parameters by uniform random numbers - mod.init_params(initializer=init) - - # use Sparse SGD with learning rate 0.1 to train - sgd = mx.optimizer.SGD(momentum=0.1, clip_gradient=5.0, learning_rate=0.01, - rescale_grad=1.0/batch_size) - mod.init_optimizer(optimizer=sgd) - if num_epochs is None: - num_epochs = 50 - expected_accuracy = 0.02 - - # use accuracy as the metric - metric = mx.gluon.metric.create('MSE') - # train 'num_epochs' epoch - for epoch in range(num_epochs): - train_iter.reset() - metric.reset() - for batch in train_iter: - mod.forward(batch, is_train=True) # compute predictions - mod.update_metric(metric, batch.label) # accumulate prediction accuracy - mod.backward() # compute gradients - mod.update() # update parameters - print('Epoch %d, Training %s' % (epoch, metric.get())) - if num_epochs > 1: - assert(metric.get()[1] < expected_accuracy) - - check_factorization_machine_module() - -@with_seed() -def test_module_initializer(): - def regression_model(m): - x = mx.symbol.var("data", stype='csr') - v = mx.symbol.var("v", shape=(m, 1), init=mx.init.Uniform(scale=.1), - stype='row_sparse') - model = mx.symbol.dot(lhs=x, rhs=v) - y = mx.symbol.Variable("label") - model = mx.symbol.LinearRegressionOutput(data=model, label=y, name="out") - return model - - n, m = 128, 100 - model = regression_model(m) - - data = mx.nd.zeros(shape=(n, m), stype='csr') - label = mx.nd.zeros((n, 1)) - iterator = mx.io.NDArrayIter(data=data, label={'label':label}, - batch_size=n, last_batch_handle='discard') - - # create module - mod = mx.mod.Module(symbol=model, data_names=['data'], label_names=['label']) - mod.bind(data_shapes=iterator.provide_data, label_shapes=iterator.provide_label) - mod.init_params() - v = mod._arg_params['v'] - assert(v.stype == 'row_sparse') - assert(np.sum(v.asnumpy()) != 0) - -@with_seed() -def test_forward_reshape(): - num_class=10 - data1 = mx.sym.Variable('data1') - data2 = mx.sym.Variable('data2') - conv1 = mx.sym.Convolution(data=data1, kernel=(2, 2), num_filter=2, stride=(2, 2)) - conv2 = mx.sym.Convolution(data=data2, kernel=(3, 3), num_filter=3, stride=(1, 1)) - pooling1 = mx.sym.Pooling(data=conv1, kernel=(2, 2), stride=(1, 1), pool_type="avg") - pooling2 = mx.sym.Pooling(data=conv2, kernel=(2, 2), stride=(1, 1), pool_type="max") - flatten1 = mx.sym.flatten(data=pooling1) - flatten2 = mx.sym.flatten(data=pooling2) - sum = mx.sym.sum(data=flatten1, axis=1) + mx.sym.sum(data=flatten2, axis=1) - fc = mx.sym.FullyConnected(data=sum, num_hidden=num_class) - sym = mx.sym.SoftmaxOutput(data=fc, name='softmax') - - dshape1 = (10, 3, 64, 64) - dshape2 = (10, 3, 32, 32) - lshape = (10,) - - mod = mx.mod.Module(symbol=sym, data_names=['data1', 'data2'], - label_names=['softmax_label']) - mod.bind(data_shapes=[('data1', dshape1), ('data2', dshape2)], - label_shapes=[('softmax_label', lshape)]) - mod.init_params() - mod.init_optimizer(optimizer_params={'learning_rate': 0.01}) - - # Train with original data shapes - data_batch = mx.io.DataBatch(data=[mx.nd.random.uniform(0, 9, dshape1), - mx.nd.random.uniform(5, 15, dshape2)], - 
-                                 label=[mx.nd.ones(lshape)])
-    mod.forward(data_batch)
-    assert mod.get_outputs()[0].shape == tuple([lshape[0], num_class])
-    mod.backward()
-    mod.update()
-
-    # Train with different batch size
-    dshape1 = (3, 3, 64, 64)
-    dshape2 = (3, 3, 32, 32)
-    lshape = (3,)
-    data_batch = mx.io.DataBatch(data=[mx.nd.random.uniform(0, 9, dshape1),
-                                       mx.nd.random.uniform(5, 15, dshape2)],
-                                 label=[mx.nd.ones(lshape)])
-    mod.forward(data_batch)
-    assert mod.get_outputs()[0].shape == tuple([lshape[0], num_class])
-    mod.backward()
-    mod.update()
-
-    dshape1 = (20, 3, 64, 64)
-    dshape2 = (20, 3, 32, 32)
-    lshape = (20,)
-    data_batch = mx.io.DataBatch(data=[mx.nd.random.uniform(3, 5, dshape1),
-                                       mx.nd.random.uniform(10, 25, dshape2)],
-                                 label=[mx.nd.ones(lshape)])
-    mod.forward(data_batch)
-    assert mod.get_outputs()[0].shape == tuple([lshape[0], num_class])
-    mod.backward()
-    mod.update()
-
-    #Train with both different batch size and data shapes
-    dshape1 = (20, 3, 120, 120)
-    dshape2 = (20, 3, 32, 64)
-    lshape = (20,)
-    data_batch = mx.io.DataBatch(data=[mx.nd.random.uniform(0, 9, dshape1),
-                                       mx.nd.random.uniform(5, 15, dshape2)],
-                                 label=[mx.nd.ones(lshape)])
-    mod.forward(data_batch)
-    assert mod.get_outputs()[0].shape == tuple([lshape[0], num_class])
-    mod.backward()
-    mod.update()
-
-    dshape1 = (5, 3, 28, 40)
-    dshape2 = (5, 3, 24, 16)
-    lshape = (5,)
-    data_batch = mx.io.DataBatch(data=[mx.nd.random.uniform(0, 9, dshape1),
-                                       mx.nd.random.uniform(15, 25, dshape2)],
-                                 label=[mx.nd.ones(lshape)])
-    mod.forward(data_batch)
-    assert mod.get_outputs()[0].shape == tuple([lshape[0], num_class])
-    mod.backward()
-    mod.update()
-
-    #Test score
-    dataset_shape1 = (30, 3, 30, 30)
-    dataset_shape2 = (30, 3, 20, 40)
-    labelset_shape = (30,)
-
-    eval_dataiter = mx.io.NDArrayIter(data=[mx.nd.random.uniform(0, 9, dataset_shape1),
-                                            mx.nd.random.uniform(15, 25, dataset_shape2)],
-                                      label=[mx.nd.ones(labelset_shape)],
-                                      batch_size=5)
-    assert len(mod.score(eval_data=eval_dataiter, eval_metric='acc')) == 1
-
-    #Test prediction
-    dshape1 = (1, 3, 30, 30)
-    dshape2 = (1, 3, 20, 40)
-    dataset_shape1 = (10, 3, 30, 30)
-    dataset_shape2 = (10, 3, 20, 40)
-
-    pred_dataiter = mx.io.NDArrayIter(data=[mx.nd.random.uniform(0, 9, dataset_shape1),
-                                            mx.nd.random.uniform(15, 25, dataset_shape2)])
-    mod.bind(data_shapes=[('data1', dshape1), ('data2', dshape2)],
-             for_training=False, force_rebind=True)
-    assert mod.predict(pred_dataiter).shape == tuple([10, num_class])
-
-@with_seed()
-def test_forward_types():
-    #Test forward with other data batch API
-    Batch = namedtuple('Batch', ['data'])
-    data = mx.sym.Variable('data')
-    out = data * 2
-    mod = mx.mod.Module(symbol=out, label_names=None)
-    mod.bind(data_shapes=[('data', (1, 10))])
-    mod.init_params()
-    data1 = [mx.nd.ones((1, 10))]
-    mod.forward(Batch(data1))
-    assert mod.get_outputs()[0].shape == (1, 10)
-    data2 = [mx.nd.ones((3, 5))]
-    mod.forward(Batch(data2))
-    assert mod.get_outputs()[0].shape == (3, 5)
-
-    #Test forward with other NDArray and np.ndarray inputs
-    data = mx.sym.Variable('data')
-    out = data * 2
-    mod = mx.mod.Module(symbol=out, label_names=None)
-    mod.bind(data_shapes=[('data', (1, 10))])
-    mod.init_params()
-    data1 = mx.nd.ones((1, 10))
-    assert mod.predict(data1).shape == (1, 10)
-    data2 = np.ones((1, 10))
-    assert mod.predict(data1).shape == (1, 10)
-
-
-def test_reference_single_batch_during_fit():
-    """
-    When using C++-based iterators, it's important that only a single batch is referenced at a time. Because C++
-    iterators are exposed to the Python code through a C API, there is no concept of reference counting. Hence,
-    typically C++ iterators will deallocate a batch when next() is called on them. So, we need to make sure the Python
-    code only references a single batch at a time, otherwise the Python code will attempt to access freed memory,
-    resulting in either (a) garbage accuracy or (b) a segmentation fault.
-    """
-    current_batch_i = None
-
-    class MockBatch(object):
-        def __init__(self, i):
-            self.i = i
-
-        @property
-        def label(self):
-            global current_batch_i
-            assert self.i == current_batch_i
-
-    class MockTrainData(object):
-        def __init__(self, batches):
-            self._i = 0
-            self._batches = batches
-            self.provide_data = None
-            self.provide_label = None
-            self.reset = lambda: None
-
-        def __iter__(self):
-            self._i = 0
-            return self
-
-        def __next__(self):
-            global current_batch_i
-
-            if self._i < self._batches:
-                current_batch_i = self._i
-                self._i += 1
-                return MockBatch(current_batch_i)
-            raise StopIteration
-
-        def next(self):
-            return self.__next__()
-
-    mod = mx.mod.BaseModule()
-
-    def empty_fn(*args, **kwargs):
-        pass
-    mod.bind = empty_fn
-    mod.init_params = empty_fn
-    mod.init_optimizer = empty_fn
-    mod.forward = empty_fn
-    mod.backward = empty_fn
-    mod.update = empty_fn
-    mod.update_metric = empty_fn
-    mod.get_params = lambda: (None, None)
-
-    train_data = MockTrainData(batches=2)
-    mod.fit(train_data, num_epoch=1)
-
-@with_seed()
-def test_bucket_module_grad_req():
-    batch_size = 2
-    def sym_gen(_):
-        data = mx.symbol.Variable('data')
-        weight = mx.symbol.Variable('a', shape=(1,), init=mx.init.One())
-        sym = mx.sym.make_loss(mx.sym.broadcast_mul(data, weight))
-        return sym, ('data',), None
-
-    mod = mx.mod.BucketingModule(sym_gen=sym_gen, default_bucket_key=10)
-    mod.bind(data_shapes=[['data', (batch_size, )]], for_training=True, grad_req='write')
-    mod.init_params()
-
-    mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))],
-                                         label=None,
-                                         provide_data=[mx.io.DataDesc(name='data', shape=(batch_size, ), layout='N')],
-                                         bucket_key=10))
-    assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == batch_size)
-
-    mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))],
-                                         label=None,
-                                         provide_data=[mx.io.DataDesc(name='data', shape=(batch_size, ), layout='N')],
-                                         bucket_key=5))
-    assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == batch_size)
-
-    mod = mx.mod.BucketingModule(sym_gen=sym_gen, default_bucket_key=10)
-    mod.bind(data_shapes=[['data', (batch_size, )]], for_training=True, grad_req='add')
-    mod.init_params()
-
-    mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))],
-                                         label=None,
-                                         provide_data=[mx.io.DataDesc(name='data', shape=(batch_size,), layout='N')],
-                                         bucket_key=10))
-    assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == batch_size)
-
-    mod.forward_backward(mx.io.DataBatch(data=[mx.nd.ones((batch_size,))],
-                                         label=None,
-                                         provide_data=[mx.io.DataDesc(name='data', shape=(batch_size,), layout='N')],
-                                         bucket_key=5))
-    assert mod._curr_module._grad_req == 'add'
-    assert(mod._curr_module._exec_group.execs[0].grad_dict['a'].asscalar() == 2 * batch_size)
-
-
-def test_module_update_no_pragram():
-    # test module to do update on layers without params
-    data_shape = (10, 10)
-    data = mx.sym.Variable('data')
-    out = mx.sym.Dropout(data, 0.5)
-    mod = mx.mod.Module(out)
-    mod.bind(data_shapes=[('data', data_shape)])
-    mod.init_params()
-    mod.init_optimizer()
-    data_batch = mx.io.DataBatch([nd.ones(data_shape)])
-    mod.forward_backward(data_batch)
-    mod.update()
-    assert(mod.get_outputs()[0].shape == data_shape)
-
-
-def test_module_init_optimizer():
-    def get_module_idx2name(mod):
-        idx2name = {}
-        idx2name.update(enumerate(mod._exec_group.param_names))
-        return idx2name
-
-    data = mx.sym.Variable('data')
-    sym = mx.sym.FullyConnected(data, num_hidden=20, name='fc')
-    batch_size = 8
-    opt_params = {'learning_rate': 1, 'rescale_grad': 1.0 / batch_size}
-
-    # Pass an optimizer str
-    mod1 = mx.mod.Module(sym, ('data',), None, context=mx.cpu(0))
-    mod1.bind(data_shapes=[('data', (batch_size, 20))])
-    mod1.init_params()
-    mod1.init_optimizer(optimizer='sgd', optimizer_params=opt_params)
-    assert mod1._optimizer.idx2name == get_module_idx2name(mod1)
-
-    # Pass an Optimizer object
-    mod2 = mx.mod.Module(sym, ('data',), None, context=mx.cpu(0))
-    mod2.bind(data_shapes=[('data', (batch_size, 20))])
-    mod2.init_params()
-    opt = mx.optimizer.SGD(**opt_params)
-    mod2.init_optimizer(optimizer=opt)
-    assert mod2._optimizer.idx2name == get_module_idx2name(mod2)
-
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 0e4405379c13..59a7e2620096 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -34,46 +34,6 @@
 import pytest
 import os
 
-def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req, rtol=1e-2, atol=1e-4):
-    dshape = (N, T, I)
-    data = mx.sym.Variable('data')
-
-    Y1, _ = cell1.unroll(T, data, layout='NTC', merge_outputs=True)
-    mod1 = mx.mod.Module(Y1, label_names=None, context=default_context())
-    mod1.bind(data_shapes=[('data', dshape)], label_shapes=None, inputs_need_grad=True, grad_req=grad_req)
-
-    Y2, _ = cell2.unroll(T, data, layout='NTC', merge_outputs=True)
-    mod2 = mx.mod.Module(Y2, label_names=None, context=default_context())
-    mod2.bind(data_shapes=[('data', dshape)], label_shapes=None, inputs_need_grad=True, grad_req=grad_req)
-
-    mod1.init_params()
-    args, auxs = mod1.get_params()
-    args = cell1.unpack_weights(args)
-    args = cell2.pack_weights(args)
-    mod2.set_params(args, auxs)
-
-    x = mx.random.uniform(shape=dshape)
-    batch=mx.io.DataBatch(data=[x])
-    # check inference
-    mod1.forward(batch, is_train=False)
-    mod2.forward(batch, is_train=False)
-    assert_allclose(mod1.get_outputs()[0].asnumpy(), mod2.get_outputs()[0].asnumpy(), rtol=rtol, atol=atol)
-
-    # check training
-    mod1.forward(batch, is_train=True)
-    mod2.forward(batch, is_train=True)
-    assert_allclose(mod1.get_outputs()[0].asnumpy(), mod2.get_outputs()[0].asnumpy(), rtol=rtol, atol=atol)
-
-    dy = mx.random.uniform(shape=mod1.get_outputs()[0].shape)
-    mod1.backward(out_grads=[dy])
-    mod2.backward(out_grads=[dy])
-    if type(grad_req) is dict and grad_req['data'] == 'null' or grad_req == 'null':
-        assert(mod1.get_input_grads()[0] == None)
-        assert(mod2.get_input_grads()[0] == None)
-    else:
-        assert_allclose(mod1.get_input_grads()[0].asnumpy(), mod2.get_input_grads()[0].asnumpy(), rtol=rtol, atol=atol)
-
-
 @with_seed()
 @assert_raises_cudnn_not_satisfied(min_version='5.1.10')
 @pytest.mark.serial
diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index d664d06fd59b..e560f13647b7 100644
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -54,32 +54,6 @@ def test_learning_rate_expect_user_warning():
         o.set_learning_rate(0.5)
 
 
-@with_seed()
-def test_lr_wd_mult():
-    data = mx.sym.Variable('data')
-    bias = mx.sym.Variable('fc1_bias', lr_mult=1.0)
-    fc1 = mx.sym.FullyConnected(data=data, bias=bias, name='fc1', num_hidden=10, lr_mult=0)
-    fc2 = mx.sym.FullyConnected(data=fc1, name='fc2', num_hidden=10, wd_mult=0.5)
-
-    mod = mx.mod.Module(symbol=fc2, label_names=None, context=default_context())
-    mod.bind(data_shapes=[('data', (5,10))])
-    mod.init_params(initializer=mx.init.Uniform(1.0))
-    mod.init_optimizer(optimizer_params={'learning_rate': 1.0})
-    args1, _ = mod.get_params()
-    args1 = {k: v.asnumpy() for k, v in args1.items()}
-    mod.forward(mx.io.DataBatch(data=[mx.random.uniform(low=-1.0, high=1.0, shape=(5,10))], label=None), is_train=True)
-    mod.backward(mod.get_outputs())
-    mod.update()
-    args2, _ = mod.get_params()
-    args2 = {k: v.asnumpy() for k, v in args2.items()}
-
-    assert mod._optimizer.lr_mult == {'fc1_bias': 1.0, 'fc1_weight': 0.0}
-    assert mod._optimizer.wd_mult == {'fc2_bias': 0.5, 'fc2_weight': 0.5}
-    assert mx.test_utils.almost_equal(args1['fc1_weight'], args2['fc1_weight'], 1e-10)
-    assert not mx.test_utils.almost_equal(args1['fc1_bias'], args2['fc1_bias'], 1e-1)
-    assert not mx.test_utils.almost_equal(args1['fc2_weight'], args2['fc2_weight'], 1e-1)
-
-
 @xfail_when_nonstandard_decimal_separator
 @with_seed()
 def test_sgd():
@@ -518,7 +492,7 @@ def test_sparse_adam():
 
 @xfail_when_nonstandard_decimal_separator
 @with_seed()
-@retry(3)
+@pytest.mark.skip(reason="Flaky test https://github.com/apache/incubator-mxnet/issues/18400")
 def test_adamax():
     opt1 = mx.optimizer.Adamax
     opt2 = mx.optimizer.Adamax