From e6a1139812db33bd11fcc8491915f6a51d42418b Mon Sep 17 00:00:00 2001 From: Eric Junyuan Xie Date: Thu, 3 Aug 2017 10:36:11 -0700 Subject: [PATCH 01/26] add backward(is_train=False) and always mode for dropout (#7303) * add backward(is_train=False) and always mode for dropout * fix * fix * fix slicing * fix mkl dropout --- include/mxnet/c_api.h | 36 +++++++++- include/mxnet/executor.h | 2 +- python/mxnet/autograd.py | 70 ++++++++++++++----- python/mxnet/base.py | 4 +- python/mxnet/contrib/autograd.py | 2 + python/mxnet/executor.py | 29 +++----- .../mxnet/gluon/model_zoo/vision/__init__.py | 5 +- python/mxnet/image/detection.py | 5 +- python/mxnet/ndarray.py | 17 +++-- src/c_api/c_api_executor.cc | 9 ++- src/c_api/c_api_ndarray.cc | 20 +++++- src/executor/graph_executor.cc | 4 +- src/executor/graph_executor.h | 2 +- src/ndarray/autograd.cc | 10 +-- src/ndarray/autograd.h | 14 +++- src/ndarray/ndarray.cc | 1 + src/operator/dropout-inl.h | 21 ++++-- src/operator/dropout.cc | 3 +- tests/python/unittest/test_autograd.py | 16 +++++ tests/python/unittest/test_operator.py | 36 ++++++++++ 20 files changed, 243 insertions(+), 63 deletions(-) diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 47447fb37196..d9a5315c9167 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -551,6 +551,13 @@ MXNET_DLL int MXImperativeInvoke(AtomicSymbolCreator creator, int num_params, const char **param_keys, const char **param_vals); +/*! + * \brief set whether to record operator for autograd + * \param is_recording 1 when recording, 0 when not recording. + * \param prev returns the previous status before this set. + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXAutogradSetIsRecording(int is_recording, int* prev); /*! * \brief set whether to record operator for autograd * \param is_train 1 when training, 0 when testing @@ -588,6 +595,20 @@ MXNET_DLL int MXAutogradBackward(mx_uint num_output, NDArrayHandle* output_handles, NDArrayHandle* ograd_handles, int retain_graph); +/*! +* \brief compute the gradient of outputs w.r.t variabels +* \param num_output number of output NDArray +* \param output_handles output NDArrays +* \param ograd_handles head gradient for NDArrays +* \param retain_graph whether to keep the graph after backward +* \param is_train whether to do backward for training or inference +* \return 0 when success, -1 when failure happens +*/ +MXNET_DLL int MXAutogradBackwardEx(mx_uint num_output, + NDArrayHandle* output_handles, + NDArrayHandle* ograd_handles, + int retain_graph, + int is_train); /*! * \brief create cached operator */ @@ -1028,7 +1049,20 @@ MXNET_DLL int MXExecutorForward(ExecutorHandle handle, int is_train); MXNET_DLL int MXExecutorBackward(ExecutorHandle handle, mx_uint len, NDArrayHandle *head_grads); - +/*! + * \brief Excecutor run backward + * + * \param handle execute handle + * \param len lenth + * \param head_grads NDArray handle for heads' gradient + * \param is_train int value to indicate whether the backward pass is for evaluation + * + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXExecutorBackwardEx(ExecutorHandle handle, + mx_uint len, + NDArrayHandle *head_grads, + int is_train); /*! * \brief Get executor's head NDArray * diff --git a/include/mxnet/executor.h b/include/mxnet/executor.h index 40bd60f5f405..9308587c8d72 100644 --- a/include/mxnet/executor.h +++ b/include/mxnet/executor.h @@ -58,7 +58,7 @@ class Executor { * * \param head_grads the gradient of head nodes to be backproped. 
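+   * \param is_train whether to run the backward pass in training mode;
+   *        affects operators such as Dropout. Defaults to true.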
*/ - virtual void Backward(const std::vector &head_grads) = 0; + virtual void Backward(const std::vector &head_grads, bool is_train = true) = 0; /*! * \brief print the execution plan info to output stream. * \param os the output stream we like to print to. diff --git a/python/mxnet/autograd.py b/python/mxnet/autograd.py index b97d350f258d..2f33052e663e 100644 --- a/python/mxnet/autograd.py +++ b/python/mxnet/autograd.py @@ -12,9 +12,7 @@ def set_recording(is_recording): """Set status to recording/not recording. When recording, graph will be constructed - for gradient computation. Operators will also run with ctx.is_train=True. For example, - Dropout will drop inputs randomly when is_train=True while simply passing through - if is_train=False. + for gradient computation. Parameters ---------- @@ -25,46 +23,77 @@ def set_recording(is_recording): previous state before this set. """ prev = ctypes.c_int() - check_call(_LIB.MXAutogradSetIsTraining( + check_call(_LIB.MXAutogradSetIsRecording( ctypes.c_int(is_recording), ctypes.byref(prev))) return bool(prev.value) +def set_training(is_train): + """Set status to training/not training. This affects ctx.is_train in operator + running context. For example, Dropout will drop inputs randomly when + is_train=True while simply passing through if is_train=False. + + Parameters + ---------- + is_train: bool + + Returns + ------- + previous state before this set. + """ + prev = ctypes.c_int() + check_call(_LIB.MXAutogradSetIsTraining( + ctypes.c_int(is_train), ctypes.byref(prev))) + return bool(prev.value) + -class TrainingStateScope(object): +class RecordingStateScope(object): """Scope for managing training state. Example:: - with TrainingStateScope(True): + with RecordingStateScope(True, True): y = model(x) backward([y]) """ - def __init__(self, enter_state): + def __init__(self, enter_state, is_train): self._enter_state = enter_state + self._enter_is_train = is_train self._prev = None + self._prev_is_train = None def __enter__(self): self._prev = set_recording(self._enter_state) + self._prev_is_train = set_training(self._enter_is_train) def __exit__(self, ptype, value, trace): if self._prev != self._enter_state: set_recording(self._prev) + if self._prev_is_train != self._enter_is_train: + set_training(self._prev_is_train) -def record(): +def record(is_train=True): """Returns a training scope context to be used in 'with' statement and captures training code. + .. note:: When forwarding with is_train=False, the corresponding backward + should also use is_train=False, otherwise gradient is undefined. + Example:: with autograd.record(): y = model(x) backward([y]) metric.update(...) optim.step(...) + + Parameters + ---------- + is_train: bool, default True + Whether to do forward for training or inference. """ - return TrainingStateScope(True) + return RecordingStateScope(True, is_train) -def pause(): +def pause(is_train=False): """Returns a testing scope context to be used in 'with' statement and captures testing code. @@ -74,8 +103,13 @@ def pause(): backward([y]) with autograd.pause(): # testing, IO, gradient updates... + + Parameters + ---------- + is_train: bool, default False + Whether to do forward for training or inference. 
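+
+    .. note:: With is_train=True, operator recording stays paused but operators
+        still run in training mode (e.g. Dropout keeps dropping inputs randomly).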
""" - return TrainingStateScope(False) + return RecordingStateScope(False, is_train) def mark_variables(variables, gradients, grad_reqs='write'): @@ -109,7 +143,7 @@ def mark_variables(variables, gradients, grad_reqs='write'): c_array(NDArrayHandle, gradient_handles))) -def backward(heads, head_grads=None, retain_graph=False): +def backward(heads, head_grads=None, retain_graph=False, is_train=True): """Compute the gradients of heads w.r.t previously marked variables. Parameters @@ -118,6 +152,8 @@ def backward(heads, head_grads=None, retain_graph=False): Output NDArray(s) head_grads: NDArray or list of NDArray or None Gradients with respect to heads. + is_train: bool, optional + Whether to do backward for training or inference. """ if isinstance(heads, NDArray): assert head_grads is None or isinstance(head_grads, NDArray) @@ -129,11 +165,12 @@ def backward(heads, head_grads=None, retain_graph=False): output_handles.append(arr.handle) if head_grads is None: - check_call(_LIB.MXAutogradBackward( + check_call(_LIB.MXAutogradBackwardEx( len(output_handles), c_array(NDArrayHandle, output_handles), ctypes.c_void_p(0), - ctypes.c_int(retain_graph))) + ctypes.c_int(retain_graph), + ctypes.c_int(is_train))) return ograd_handles = [] @@ -145,8 +182,9 @@ def backward(heads, head_grads=None, retain_graph=False): assert len(ograd_handles) == len(output_handles), \ "heads and head_grads must have the same length" - check_call(_LIB.MXAutogradBackward( + check_call(_LIB.MXAutogradBackwardEx( len(output_handles), c_array(NDArrayHandle, output_handles), c_array(NDArrayHandle, ograd_handles), - ctypes.c_int(retain_graph))) + ctypes.c_int(retain_graph), + ctypes.c_int(is_train))) diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 6d537529e8af..ddaeb6e77d54 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -18,14 +18,14 @@ #---------------------------- if sys.version_info[0] == 3: string_types = str, - numeric_types = (float, int, np.float32, np.int32) + numeric_types = (float, int, np.generic) integer_types = int # this function is needed for python3 # to convert ctypes.char_p .value back to python str py_str = lambda x: x.decode('utf-8') else: string_types = basestring, - numeric_types = (float, int, long, np.float32, np.int32) + numeric_types = (float, int, long, np.generic) integer_types = (int, long) py_str = lambda x: x diff --git a/python/mxnet/contrib/autograd.py b/python/mxnet/contrib/autograd.py index e56361efdb1f..9074e452c981 100644 --- a/python/mxnet/contrib/autograd.py +++ b/python/mxnet/contrib/autograd.py @@ -28,6 +28,8 @@ def set_is_training(is_train): prev = ctypes.c_int() check_call(_LIB.MXAutogradSetIsTraining( ctypes.c_int(is_train), ctypes.byref(prev))) + check_call(_LIB.MXAutogradSetIsRecording( + ctypes.c_int(is_train), ctypes.byref(prev))) return bool(prev.value) diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py index 6b9aab2de6f1..d2b108cc04ed 100644 --- a/python/mxnet/executor.py +++ b/python/mxnet/executor.py @@ -5,7 +5,6 @@ import ctypes import copy -import warnings import numpy as np from .base import _LIB from .base import mx_uint, NDArrayHandle, ExecutorHandle @@ -61,7 +60,6 @@ def __init__(self, handle, symbol, ctx, grad_req, group2ctx): self._aux_dict = None self._output_dict = None self._monitor_callback = None - self._output_dirty = False self._ctx = copy.deepcopy(ctx) self._grad_req = copy.deepcopy(grad_req) self._group2ctx = copy.deepcopy(group2ctx) @@ -99,8 +97,7 @@ def forward(self, is_train=False, **kwargs): ---------- is_train: 
bool, optional Whether this forward is for evaluation purpose. If True, - a backward call is expected to follow. Otherwise following - backward is invalid. + a backward call is expected to follow. **kwargs Additional specification of input arguments. @@ -132,15 +129,9 @@ def forward(self, is_train=False, **kwargs): self.handle, ctypes.c_int(int(is_train)))) - if self._output_dirty: - warnings.warn( - "Calling forward the second time after forward(is_train=True) " - "without calling backward first. Is this intended?", stacklevel=2) - self._output_dirty = is_train - return self.outputs - def backward(self, out_grads=None): + def backward(self, out_grads=None, is_train=True): """Do backward pass to get the gradient of arguments. Parameters @@ -149,6 +140,11 @@ def backward(self, out_grads=None): Gradient on the outputs to be propagated back. This parameter is only needed when bind is called on outputs that are not a loss function. + is_train : bool, default True + Whether this backward is for training or inference. Note that in rare + cases you want to call backward with is_train=False to get gradient + during inference. + Examples -------- @@ -211,16 +207,11 @@ def backward(self, out_grads=None): if not isinstance(obj, NDArray): raise TypeError("inputs must be NDArray") ndarray = c_array(NDArrayHandle, [item.handle for item in out_grads]) - check_call(_LIB.MXExecutorBackward( + check_call(_LIB.MXExecutorBackwardEx( self.handle, mx_uint(len(out_grads)), - ndarray)) - - if not self._output_dirty: - warnings.warn( - "Calling backward without calling forward(is_train=True) " - "first. Behavior is undefined.", stacklevel=2) - self._output_dirty = False + ndarray, + ctypes.c_int(is_train))) def set_monitor_callback(self, callback): """Install callback for monitor. diff --git a/python/mxnet/gluon/model_zoo/vision/__init__.py b/python/mxnet/gluon/model_zoo/vision/__init__.py index e0498dcc6bca..56e46f9a0c74 100644 --- a/python/mxnet/gluon/model_zoo/vision/__init__.py +++ b/python/mxnet/gluon/model_zoo/vision/__init__.py @@ -102,5 +102,8 @@ def get_model(name, **kwargs): 'inceptionv3': inception_v3, } name = name.lower() - assert name in models, 'Model %s is not supported'%name + if name not in models: + raise ValueError( + 'Model %s is not supported. 
Available options are\n\t%s'%( + name, '\n\t'.join(sorted(models.keys())))) return models[name](**kwargs) diff --git a/python/mxnet/image/detection.py b/python/mxnet/image/detection.py index d5e5c1e7e691..0a16ac36fc98 100644 --- a/python/mxnet/image/detection.py +++ b/python/mxnet/image/detection.py @@ -756,8 +756,9 @@ def next(self): assert i < batch_size, 'Batch size must be multiples of augmenter output length' batch_data[i][:] = self.postprocess_data(datum) num_object = label.shape[0] - batch_label[i][0:num_object][:] = nd.array(label) - batch_label[i][num_object:][:] = -1 + batch_label[i][0:num_object] = nd.array(label) + if num_object < batch_label[i].shape[0]: + batch_label[i][num_object:] = -1 i += 1 except StopIteration: if not i: diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py index fdecebbe7996..b2178a98a84e 100644 --- a/python/mxnet/ndarray.py +++ b/python/mxnet/ndarray.py @@ -124,6 +124,7 @@ class NDArray(NDArrayBase): """ __slots__ = [] + # make numpy functions return NDArray instead of numpy object array __array_priority__ = 1000.0 # pylint: disable= no-member, undefined-variable def __repr__(self): @@ -1058,22 +1059,30 @@ def detach(self): check_call(_LIB.MXNDArrayDetach(self.handle, ctypes.byref(hdl))) return NDArray(hdl) - def backward(self, out_grad=None, retain_graph=False): + def backward(self, out_grad=None, retain_graph=False, is_train=True): """Compute the gradients of this NDArray w.r.t variables. Parameters ---------- - out_grad: list of NDArray or None + out_grad : NDArray, optional + Gradient with respect to head. + retain_graph : bool, optional + Whether to retain the computaion graph for another backward + pass on the same graph. By default the computaion history + is cleared. + is_train : bool, optional + Whether to compute gradient for training or inference. 
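+
+        Example::
+
+            x = mx.nd.ones((2, 2))
+            x.attach_grad()
+            with mx.autograd.record(is_train=False):
+                y = mx.nd.Dropout(x, p=0.5)
+            y.backward(is_train=False)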
""" if out_grad is None: ograd_handles = [NDArrayHandle(0)] else: ograd_handles = [out_grad.handle] - check_call(_LIB.MXAutogradBackward( + check_call(_LIB.MXAutogradBackwardEx( 1, c_array(NDArrayHandle, [self.handle]), c_array(NDArrayHandle, ograd_handles), - ctypes.c_int(retain_graph))) + ctypes.c_int(retain_graph), + ctypes.c_int(is_train))) def onehot_encode(indices, out): diff --git a/src/c_api/c_api_executor.cc b/src/c_api/c_api_executor.cc index ca49402ecf7e..3ba3154f2d97 100644 --- a/src/c_api/c_api_executor.cc +++ b/src/c_api/c_api_executor.cc @@ -35,6 +35,13 @@ int MXExecutorForward(ExecutorHandle handle, int is_train) { int MXExecutorBackward(ExecutorHandle handle, mx_uint len, NDArrayHandle *head_grads) { + return MXExecutorBackwardEx(handle, len, head_grads, true); +} + +int MXExecutorBackwardEx(ExecutorHandle handle, + mx_uint len, + NDArrayHandle *head_grads, + int is_train) { API_BEGIN(); Executor *exec = static_cast(handle); std::vector ndarrays; @@ -42,7 +49,7 @@ int MXExecutorBackward(ExecutorHandle handle, for (mx_uint i = 0; i < len; ++i) { ndarrays.push_back(*args_ptr[i]); } - exec->Backward(ndarrays); + exec->Backward(ndarrays, is_train); API_END(); } diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index 818f263cb3b7..f40139424b31 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -378,7 +378,7 @@ void ImperativeInvokeImpl(const Context& default_ctx, } if (fn) { - if (AutogradRuntime::Get()->IsTraining()) { + if (AutogradRuntime::Get()->IsRecording()) { AutogradRuntime::Get()->RecordImperativeFCompute(op, attrs, &ndinputs, &ndoutputs); } @@ -387,7 +387,7 @@ void ImperativeInvokeImpl(const Context& default_ctx, } else if (createop.count(op)) { auto state = createop[op](attrs, ctx, ret->arg_shapes, ret->arg_types); - if (AutogradRuntime::Get()->IsTraining()) { + if (AutogradRuntime::Get()->IsRecording()) { AutogradRuntime::Get()->RecordImperativeOperator(state, op, attrs, &ndinputs, &ndoutputs); } @@ -528,6 +528,12 @@ int MXAutogradSetIsTraining(int is_training, int* prev) { API_END(); } +int MXAutogradSetIsRecording(int is_recording, int* prev) { + API_BEGIN(); + *prev = AutogradRuntime::Get()->SetIsRecording(static_cast(is_recording)); + API_END(); +} + int MXAutogradMarkVariables(mx_uint num_var, NDArrayHandle *var_handles, mx_uint *reqs_array, @@ -556,6 +562,14 @@ int MXAutogradBackward(mx_uint num_output, NDArrayHandle *output_handles, NDArrayHandle *ograd_handles, int retain_graph) { + return MXAutogradBackwardEx(num_output, output_handles, ograd_handles, retain_graph, true); +} + +int MXAutogradBackwardEx(mx_uint num_output, + NDArrayHandle *output_handles, + NDArrayHandle *ograd_handles, + int retain_graph, + int is_train) { API_BEGIN(); MXAPIThreadLocalEntry *ret = MXAPIThreadLocalStore::Get(); @@ -574,6 +588,6 @@ int MXAutogradBackward(mx_uint num_output, } } - AutogradRuntime::Get()->ComputeGradient(outputs, ograds, retain_graph); + AutogradRuntime::Get()->ComputeGradient(outputs, ograds, retain_graph, is_train); API_END(); } diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index af5ec7f492dd..a17f44a7cff5 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -42,7 +42,7 @@ void GraphExecutor::PartialForward(bool is_train, int step, int *step_left) { *step_left = static_cast(num_forward_nodes_ - sstep - 1); } -void GraphExecutor::Backward(const std::vector& head_grads) { +void GraphExecutor::Backward(const std::vector& head_grads, bool is_train) { const 
auto& idx = graph_.indexed_graph(); if (num_forward_inputs_ != idx.input_nodes().size()) { for (size_t i = 0; i < head_grad_array_.size(); ++i) { @@ -57,7 +57,7 @@ void GraphExecutor::Backward(const std::vector& head_grads) { } } } - RunOps(true, num_forward_nodes_, idx.num_nodes()); + RunOps(is_train, num_forward_nodes_, idx.num_nodes()); } void GraphExecutor::Print(std::ostream &os) const { // NOLINT(*) diff --git a/src/executor/graph_executor.h b/src/executor/graph_executor.h index 5b6fa395b242..0efb8ae09f4a 100644 --- a/src/executor/graph_executor.h +++ b/src/executor/graph_executor.h @@ -44,7 +44,7 @@ class GraphExecutor : public Executor { virtual ~GraphExecutor(); void Forward(bool is_train) override; void PartialForward(bool is_train, int step, int *step_left) override; - void Backward(const std::vector &head_grads) override; + void Backward(const std::vector &head_grads, bool is_train = true) override; const std::vector& outputs() const override; const std::unordered_map& in_arg_map() const override; const std::unordered_map& arg_grad_map() const override; diff --git a/src/ndarray/autograd.cc b/src/ndarray/autograd.cc index f990ee2973fd..efb6bc9dbf8d 100644 --- a/src/ndarray/autograd.cc +++ b/src/ndarray/autograd.cc @@ -23,9 +23,11 @@ using nnvm::NodeEntryMap; using exec::GraphExecutor; #if DMLC_CXX11_THREAD_LOCAL -thread_local bool AutogradRuntime::is_train_; +thread_local bool AutogradRuntime::is_train_ = false; +thread_local bool AutogradRuntime::is_recording_ = false; #else -MX_THREAD_LOCAL bool AutogradRuntime::is_train_; +MX_THREAD_LOCAL bool AutogradRuntime::is_train_ = false; +MX_THREAD_LOCAL bool AutogradRuntime::is_recording_ = false; #endif template @@ -149,7 +151,7 @@ AGNodePtr AutogradRuntime::RecordOp(const nnvm::Op* op, void AutogradRuntime::ComputeGradient(const std::vector& outputs, const std::vector& ograds, - bool retain_graph) { + bool retain_graph, bool is_train) { static auto& fmutate_inputs = nnvm::Op::GetAttr("FMutateInputs"); std::vector heads; Symbol sym; @@ -233,7 +235,7 @@ void AutogradRuntime::ComputeGradient(const std::vector& outputs, } } - exec->Backward(head_grads); + exec->Backward(head_grads, is_train); delete exec; } diff --git a/src/ndarray/autograd.h b/src/ndarray/autograd.h index baf843dbd4e1..474864009688 100644 --- a/src/ndarray/autograd.h +++ b/src/ndarray/autograd.h @@ -63,6 +63,16 @@ class AutogradRuntime { bool IsTraining() const { return is_train_; } + /*! \brief turn on or turn off operator recording for autograd. */ + bool SetIsRecording(bool is_recording) { + bool old = is_recording_; + is_recording_ = is_recording; + return old; + } + /*! \brief whether operator recording is on. */ + bool IsRecording() const { + return is_recording_; + } /*! \brief mark variables for computing gradients. */ void MarkVariables(const std::vector& variables, const std::vector& grad_reqs, @@ -81,7 +91,7 @@ class AutogradRuntime { /*! \brief compute the gradient of outputs w.r.t variables. */ void ComputeGradient(const std::vector& outputs, const std::vector& ograds, - bool retain_graph); + bool retain_graph, bool is_train); /*! \return AutogradRuntime singleton */ static AutogradRuntime* Get(); /*! \brief Get shared pointer reference to AutogradRuntime singleton. @@ -109,8 +119,10 @@ class AutogradRuntime { /*! \brief indicate whether is training. 
*/ #if DMLC_CXX11_THREAD_LOCAL static thread_local bool is_train_; + static thread_local bool is_recording_; #else static MX_THREAD_LOCAL bool is_train_; + static MX_THREAD_LOCAL bool is_recording_; #endif /*! \brief node count used for naming */ std::atomic node_count_{0}; diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index f2e90dd56f31..48499fa2cafd 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -69,6 +69,7 @@ NDArray NDArray::Slice(index_t begin, index_t end) const { using namespace autograd; NDArray ret = *this; CHECK(!is_none()) << "NDArray is not initialized"; + CHECK_LT(begin, end) << "Invalid slicing range [" << begin << ", " << end << ")"; CHECK_GE(shape_[0], end) << "Slice end index out of range"; size_t length = shape_.ProdShape(1, shape_.ndim()); MSHADOW_TYPE_SWITCH(ret.dtype(), DType, { diff --git a/src/operator/dropout-inl.h b/src/operator/dropout-inl.h index e77d61351be0..57d78146a68d 100644 --- a/src/operator/dropout-inl.h +++ b/src/operator/dropout-inl.h @@ -29,6 +29,7 @@ namespace dropout { enum DropoutOpInputs {kData}; enum DropoutOpOutputs {kOut, kMask}; enum DropoutOpForwardResource {kRandom}; +enum DropoutOpMode {kTraining, kAlways}; } // namespace dropout namespace mxnet { @@ -58,10 +59,16 @@ static void bernoulli_generate(int n, double p, int* r) { struct DropoutParam : public dmlc::Parameter { float p; + int mode; DMLC_DECLARE_PARAMETER(DropoutParam) { DMLC_DECLARE_FIELD(p).set_default(0.5) .set_range(0, 1) .describe("Fraction of the input that gets dropped out during training time."); + DMLC_DECLARE_FIELD(mode) + .add_enum("training", dropout::kTraining) + .add_enum("always", dropout::kAlways) + .set_default(dropout::kTraining) + .describe("Whether to only turn on dropout during training or to also turn on for inference."); } }; // struct DropoutParam @@ -70,6 +77,7 @@ class DropoutOp : public Operator { public: explicit DropoutOp(DropoutParam param) { this->pkeep_ = 1.0f - param.p; + this->mode_ = param.mode; } virtual void Forward(const OpContext &ctx, @@ -86,7 +94,7 @@ class DropoutOp : public Operator { Stream *s = ctx.get_stream(); Tensor data = in_data[dropout::kData].FlatTo2D(s); Tensor out = out_data[dropout::kOut].FlatTo2D(s); - if (ctx.is_train) { + if (ctx.is_train || mode_ == dropout::kAlways) { Tensor mask = out_data[dropout::kMask].FlatTo2D(s); #if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP) DType* outptr = out.dptr_; @@ -96,7 +104,7 @@ class DropoutOp : public Operator { bernoulli_generate(count, this->pkeep_, maskptr); #pragma omp parallel for for (int i = 0; i < count; ++i) { - outptr[i] = dataptr[i] * maskptr[i]; + outptr[i] = dataptr[i] * maskptr[i] * (1.0f / pkeep_); } #else Random *prnd = ctx.requested[dropout::kRandom].get_random(s); @@ -124,6 +132,7 @@ class DropoutOp : public Operator { Tensor grad = out_grad[dropout::kOut].FlatTo2D(s); Tensor mask = out_data[dropout::kMask].FlatTo2D(s); Tensor gdata = in_grad[dropout::kData].FlatTo2D(s); + if (ctx.is_train || mode_ == dropout::kAlways) { #if !defined(__CUDACC__) && defined(USE_MKL) && defined(_OPENMP) DType* ingradptr = gdata.dptr_; DType* outgradptr = grad.dptr_; @@ -131,17 +140,21 @@ class DropoutOp : public Operator { int count = mask.shape_[0]*mask.shape_[1]; - #pragma omp parallel for + #pragma omp parallel for for (int i = 0; i < count; ++i) { - ingradptr[i] = outgradptr[i] * maskptr[i]; + ingradptr[i] = outgradptr[i] * maskptr[i] * (1.0f / pkeep_); } #else // USE_MKL && _OPENMP Assign(gdata, req[dropout::kData], grad * mask); 
#endif // USE_MKL && _OPENMP + } else { + Assign(gdata, req[dropout::kData], F(grad)); + } } private: real_t pkeep_; + int mode_; }; // class DropoutOp diff --git a/src/operator/dropout.cc b/src/operator/dropout.cc index 74a50baf80a4..e206214e9b64 100644 --- a/src/operator/dropout.cc +++ b/src/operator/dropout.cc @@ -33,7 +33,8 @@ MXNET_REGISTER_OP_PROPERTY(Dropout, DropoutProp) The whole array is rescaled by :math:`1/(1-p)` to keep the expected sum of the input unchanged. -- During testing, this operator does not change the input. +- During testing, this operator does not change the input if mode is 'training'. + If mode is 'always', the same computaion as during training will be applied. Example:: diff --git a/tests/python/unittest/test_autograd.py b/tests/python/unittest/test_autograd.py index 8dea04da6abc..172075dcfda1 100644 --- a/tests/python/unittest/test_autograd.py +++ b/tests/python/unittest/test_autograd.py @@ -248,6 +248,22 @@ def test_attach_grad(): assert (x.grad.asnumpy() == 2).all() +def test_is_train(): + x = mx.nd.ones((10, 10)) + x.attach_grad() + with record(True): + y = mx.nd.Dropout(x, p=0.5) + assert y.asnumpy().max() == 2 and y.asnumpy().min() == 0 + y.backward() + assert (x.grad.asnumpy() == y.asnumpy()).all() + + with record(False): + y = mx.nd.Dropout(x, p=0.5) + assert (y.asnumpy() == x.asnumpy()).all() + y.backward(is_train=False) + assert (x.grad.asnumpy() == x.asnumpy()).all() + + if __name__ == "__main__": import nose nose.runmodule() diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 2f7c3b904e01..51a77e0af221 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -3658,6 +3658,42 @@ def test_stack(): check_numeric_gradient(out, inputs) +def test_dropout(): + # test dropout + x = mx.sym.var('data') + y = mx.sym.Dropout(x, p=0.5) + exe = y.simple_bind(ctx=default_context(), data=(10, 10)) + + exe.arg_arrays[0][:] = 1 + exe.forward(is_train=True) + assert exe.outputs[0].asnumpy().max() == 2 + assert exe.outputs[0].asnumpy().min() == 0 + exe.backward([mx.nd.ones((10, 10))]) + assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all() + + exe.forward(is_train=False) + assert (exe.outputs[0].asnumpy() == exe.arg_arrays[0].asnumpy()).all() + exe.backward([mx.nd.ones((10, 10))], is_train=False) + assert (exe.grad_arrays[0].asnumpy() == exe.arg_arrays[0].asnumpy()).all() + + # test permanent dropout + x = mx.sym.var('data') + y = mx.sym.Dropout(x, p=0.5, mode='always') + exe = y.simple_bind(ctx=default_context(), data=(10, 10)) + + exe.arg_arrays[0][:] = 1 + exe.forward(is_train=True) + assert exe.outputs[0].asnumpy().max() == 2 + assert exe.outputs[0].asnumpy().min() == 0 + exe.backward([mx.nd.ones((10, 10))]) + assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all() + + exe.forward(is_train=False) + assert exe.outputs[0].asnumpy().max() == 2 + assert exe.outputs[0].asnumpy().min() == 0 + exe.backward([mx.nd.ones((10, 10))], is_train=False) + assert (exe.grad_arrays[0].asnumpy() == exe.outputs[0].asnumpy()).all() + if __name__ == '__main__': import nose From 5a286b28dd60574ec4d0ede1252eb1ee4dbe3088 Mon Sep 17 00:00:00 2001 From: Madan Jampani Date: Thu, 3 Aug 2017 12:53:06 -0700 Subject: [PATCH 02/26] Fix module tutorial (#7324) --- docs/tutorials/basic/module.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/tutorials/basic/module.md b/docs/tutorials/basic/module.md index 15fdaeef68c4..e0618ca65e4a 100644 --- 
a/docs/tutorials/basic/module.md +++ b/docs/tutorials/basic/module.md @@ -173,8 +173,8 @@ dataset and evaluates the performance according to the given input metric. It can be used as follows: ```python -score = mod.score(val_iter, ['mse', 'acc']) -print("Accuracy score is %f" % (score)) +score = mod.score(val_iter, ['acc']) +print("Accuracy score is %f" % (score[0][1])) ``` Some of the other metrics which can be used are `top_k_acc`(top-k-accuracy), From 1f0b8130db4d22b5f2b1f3df26e26bc8a65cba2b Mon Sep 17 00:00:00 2001 From: formath Date: Fri, 4 Aug 2017 04:01:44 +0800 Subject: [PATCH 03/26] assert size eq between shared_module.execs and context (#7233) --- python/mxnet/module/module.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/mxnet/module/module.py b/python/mxnet/module/module.py index 2a36c6ad7e7e..b31ea0ffa319 100644 --- a/python/mxnet/module/module.py +++ b/python/mxnet/module/module.py @@ -386,6 +386,7 @@ def bind(self, data_shapes, label_shapes=None, for_training=True, assert isinstance(shared_module, Module) and \ shared_module.binded and shared_module.params_initialized shared_group = shared_module._exec_group + assert len(shared_group.execs) == len(self._context) else: shared_group = None From 4939dc25ff89607b8e0584f9b30a804f7183e07d Mon Sep 17 00:00:00 2001 From: Xu Dong Date: Fri, 4 Aug 2017 04:03:50 +0800 Subject: [PATCH 04/26] Add document for BilinearSampler Op (#7203) * Update document for BilinearSamplerOp * fix lint * remove space --- src/operator/bilinear_sampler.cc | 84 +++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 12 deletions(-) diff --git a/src/operator/bilinear_sampler.cc b/src/operator/bilinear_sampler.cc index f76e987440fa..ca83a43343a6 100644 --- a/src/operator/bilinear_sampler.cc +++ b/src/operator/bilinear_sampler.cc @@ -152,17 +152,77 @@ MXNET_REGISTER_OP_PROPERTY(BilinearSampler, BilinearSamplerProp) .add_argument("grid", "NDArray-or-Symbol", "Input grid to the BilinearsamplerOp." "grid has two channels: x_src, y_src") .add_arguments(BilinearSamplerParam::__FIELDS__()) -.describe("Applies bilinear sampling to input feature map," -" which is the key of \"[NIPS2015] Spatial Transformer Networks\"\n " -"output[batch, channel, y_dst, x_dst] = G(data[batch, channel, y_src, x_src)\n " -"x_dst, y_dst enumerate all spatial locations in output\n " -"x_src = grid[batch, 0, y_dst, x_dst]\n " -"y_src = grid[batch, 1, y_dst, x_dst]\n " -"G() denotes the bilinear interpolation kernel\n" -"The out-boundary points will be padded as zeros. (The boundary is defined to be [-1, 1])\n" -"The shape of output will be (data.shape[0], data.shape[1], grid.shape[2], grid.shape[3])\n" -"The operator assumes that grid has been nomalized. " -"If you want to design a CustomOp to manipulate grid, " -"please refer to GridGeneratorOp."); +.describe(R"code(Applies bilinear sampling to input feature map. + +Bilinear Sampling is the key of [NIPS2015] \"Spatial Transformer Networks\". The usage of the operator is very similar to remap function in OpenCV, +except that the operator has the backward pass. + +Given :math:`data` and :math:`grid`, then the output is computed by + +.. math:: + x_{src} = grid[batch, 0, y_{dst}, x_{dst}] \\ + y_{src} = grid[batch, 1, y_{dst}, x_{dst}] \\ + output[batch, channel, y_{dst}, x_{dst}] = G(data[batch, channel, y_{src}, x_{src}) + +:math:`x_{dst}`, :math:`y_{dst}` enumerate all spatial locations in :math:`output`, and :math:`G()` denotes the bilinear interpolation kernel. 
+The out-boundary points will be padded with zeros.The shape of the output will be (data.shape[0], data.shape[1], grid.shape[2], grid.shape[3]). + +The operator assumes that :math:`data` has 'NCHW' layout and :math:`grid` has been normalized to [-1, 1]. + +BilinearSampler often cooperates with GridGenerator which generates sampling grids for BilinearSampler. +GridGenerator supports two kinds of transformation: ``affine`` and ``warp``. +If users want to design a CustomOp to manipulate :math:`grid`, please firstly refer to the code of GridGenerator. + +Example 1:: + + ## Zoom out data two times + data = array([[[[1, 4, 3, 6], + [1, 8, 8, 9], + [0, 4, 1, 5], + [1, 0, 1, 3]]]]) + + affine_matrix = array([[2, 0, 0], + [0, 2, 0]]) + + affine_matrix = reshape(affine_matrix, shape=(1, 6)) + + grid = GridGenerator(data=affine_matrix, transform_type='affine', target_shape=(4, 4)) + + out = BilinearSampler(data, grid) + + out + [[[[ 0, 0, 0, 0], + [ 0, 3.5, 6.5, 0], + [ 0, 1.25, 2.5, 0], + [ 0, 0, 0, 0]]] + + +Example 2:: + + ## shift data horizontally by -1 pixel + + data = array([[[[1, 4, 3, 6], + [1, 8, 8, 9], + [0, 4, 1, 5], + [1, 0, 1, 3]]]]) + + warp_maxtrix = array([[[[1, 1, 1, 1], + [1, 1, 1, 1], + [1, 1, 1, 1], + [1, 1, 1, 1]], + [[0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0]]]]) + + grid = GridGenerator(data=warp_matrix, transform_type='warp') + out = BilinearSampler(data, grid) + + out + [[[[ 4, 3, 6, 0], + [ 8, 8, 9, 0], + [ 4, 1, 5, 0], + [ 0, 1, 3, 0]]] +)code" ADD_FILELINE); } // namespace op } // namespace mxnet From 2fe7aa4189941ec5d1673025336417248c22e38a Mon Sep 17 00:00:00 2001 From: Xu Dong Date: Fri, 4 Aug 2017 04:04:52 +0800 Subject: [PATCH 05/26] Fix bug in symbolic RNN (#7282) * Remove forget_bais in ConvLSTM * Remove the hard code about conv_layout * Add interface for initializer * Remove repetitive code in _call_ function --- python/mxnet/rnn/rnn_cell.py | 195 ++++++++++++++---------------- tests/python/unittest/test_rnn.py | 2 +- 2 files changed, 92 insertions(+), 105 deletions(-) diff --git a/python/mxnet/rnn/rnn_cell.py b/python/mxnet/rnn/rnn_cell.py index 99d0e8ad606f..c8213a20f9ef 100644 --- a/python/mxnet/rnn/rnn_cell.py +++ b/python/mxnet/rnn/rnn_cell.py @@ -1072,41 +1072,14 @@ def unroll(self, length, inputs, begin_state=None, layout='NTC', merge_outputs=N class BaseConvRNNCell(BaseRNNCell): - """Abstract base class for Convolutional RNN cells - - Parameters - ---------- - input_shape : tuple of int - Shape of input in single timestep. - num_hidden : int - Number of units in output symbol. - h2h_kernel : tuple of int - Kernel of Convolution operator in state-to-state transitions. - h2h_dilate : tuple of int - Dilation of Convolution operator in state-to-state transitions. - i2h_kernel : tuple of int - Kernel of Convolution operator in input-to-state transitions. - i2h_stride : tuple of int - Stride of Convolution operator in input-to-state transitions. - i2h_pad : tuple of int - Pad of Convolution operator in input-to-state transitions. - i2h_dilate : tuple of int - Dilation of Convolution operator in input-to-state transitions. - activation : str or Symbol, - Type of activation function. - prefix : str, default '' - Prefix for name of layers (and name of weight if params is None). - params : RNNParams, default None - Container for weight sharing between cells. Created if None. 
- conv_layout : str, , default 'NCHW' - Layout of ConvolutionOp - """ + """Abstract base class for Convolutional RNN cells""" def __init__(self, input_shape, num_hidden, h2h_kernel, h2h_dilate, i2h_kernel, i2h_stride, i2h_pad, i2h_dilate, - activation, - prefix='', params=None, conv_layout='NCHW'): + i2h_weight_initializer, h2h_weight_initializer, + i2h_bias_initializer, h2h_bias_initializer, + activation, prefix='', params=None, conv_layout='NCHW'): super(BaseConvRNNCell, self).__init__(prefix=prefix, params=params) # Convolution setting self._h2h_kernel = h2h_kernel @@ -1137,11 +1110,46 @@ def __init__(self, input_shape, num_hidden, self._state_shape = self._state_shape.infer_shape(data=input_shape)[1][0] self._state_shape = (0, ) + self._state_shape[1:] + # Get params + self._iW = self.params.get('i2h_weight', init=i2h_weight_initializer) + self._hW = self.params.get('h2h_weight', init=h2h_weight_initializer) + self._iB = self.params.get('i2h_bias', init=i2h_bias_initializer) + self._hB = self.params.get('h2h_bias', init=h2h_bias_initializer) + + @property + def _num_gates(self): + return len(self._gate_names) + @property def state_info(self): return [{'shape': self._state_shape, '__layout__': self._conv_layout}, {'shape': self._state_shape, '__layout__': self._conv_layout}] + def _conv_forward(self, inputs, states, name): + + i2h = symbol.Convolution(name='%si2h'%name, + data=inputs, + num_filter=self._num_hidden*self._num_gates, + kernel=self._i2h_kernel, + stride=self._i2h_stride, + pad=self._i2h_pad, + dilate=self._i2h_dilate, + weight=self._iW, + bias=self._iB, + layout=self._conv_layout) + + h2h = symbol.Convolution(name='%sh2h'%name, + data=states[0], + num_filter=self._num_hidden*self._num_gates, + kernel=self._h2h_kernel, + dilate=self._h2h_dilate, + pad=self._h2h_pad, + stride=(1, 1), + weight=self._hW, + bias=self._hB, + layout=self._conv_layout) + return i2h, h2h + def __call__(self, inputs, states): raise NotImplementedError("BaseConvRNNCell is abstract class for convolutional RNN") @@ -1166,6 +1174,16 @@ class ConvRNNCell(BaseConvRNNCell): Pad of Convolution operator in input-to-state transitions. i2h_dilate : tuple of int, default (1, 1) Dilation of Convolution operator in input-to-state transitions. + i2h_weight_initializer : str or Initializer + Initializer for the input weights matrix, used for the convolution + transformation of the inputs. + h2h_weight_initializer : str or Initializer + Initializer for the recurrent weights matrix, used for the convolution + transformation of the recurrent state. + i2h_bias_initializer : str or Initializer, default zeros + Initializer for the bias vector. + h2h_bias_initializer : str or Initializer, default zeros + Initializer for the bias vector. activation : str or Symbol, default functools.partial(symbol.LeakyReLU, act_type='leaky', slope=0.2) Type of activation function. 
@@ -1180,19 +1198,20 @@ def __init__(self, input_shape, num_hidden, h2h_kernel=(3, 3), h2h_dilate=(1, 1), i2h_kernel=(3, 3), i2h_stride=(1, 1), i2h_pad=(1, 1), i2h_dilate=(1, 1), + i2h_weight_initializer=None, h2h_weight_initializer=None, + i2h_bias_initializer='zeros', h2h_bias_initializer='zeros', activation=functools.partial(symbol.LeakyReLU, act_type='leaky', slope=0.2), prefix='ConvRNN_', params=None, conv_layout='NCHW'): super(ConvRNNCell, self).__init__(input_shape=input_shape, num_hidden=num_hidden, h2h_kernel=h2h_kernel, h2h_dilate=h2h_dilate, i2h_kernel=i2h_kernel, i2h_stride=i2h_stride, i2h_pad=i2h_pad, i2h_dilate=i2h_dilate, + i2h_weight_initializer=i2h_weight_initializer, + h2h_weight_initializer=h2h_weight_initializer, + i2h_bias_initializer=i2h_bias_initializer, + h2h_bias_initializer=h2h_bias_initializer, activation=activation, prefix=prefix, params=params, conv_layout=conv_layout) - # Get params - self._iW = self.params.get('i2h_weight') - self._hW = self.params.get('h2h_weight') - self._iB = self.params.get('i2h_bias') - self._hB = self.params.get('h2h_bias') @property def _gate_names(self): @@ -1201,24 +1220,7 @@ def _gate_names(self): def __call__(self, inputs, states): self._counter += 1 name = '%st%d_'%(self._prefix, self._counter) - i2h = symbol.Convolution(name='%si2h'%name, - data=inputs, - num_filter=self._num_hidden, - kernel=self._i2h_kernel, - stride=self._i2h_stride, - pad=self._i2h_pad, - dilate=self._i2h_dilate, - weight=self._iW, - bias=self._iB,) - h2h = symbol.Convolution(name='%sh2h'%name, - data=states[0], - num_filter=self._num_hidden, - kernel=self._h2h_kernel, - dilate=self._h2h_dilate, - pad=self._h2h_pad, - stride=(1, 1), - weight=self._hW, - bias=self._hB) + i2h, h2h = self._conv_forward(inputs, states, name) output = self._get_activation(i2h + h2h, self._activation, name='%sout'%name) return output, [output] @@ -1248,6 +1250,16 @@ class ConvLSTMCell(BaseConvRNNCell): Pad of Convolution operator in input-to-state transitions. i2h_dilate : tuple of int, default (1, 1) Dilation of Convolution operator in input-to-state transitions. + i2h_weight_initializer : str or Initializer + Initializer for the input weights matrix, used for the convolution + transformation of the inputs. + h2h_weight_initializer : str or Initializer + Initializer for the recurrent weights matrix, used for the convolution + transformation of the recurrent state. + i2h_bias_initializer : str or Initializer, default zeros + Initializer for the bias vector. + h2h_bias_initializer : str or Initializer, default zeros + Initializer for the bias vector. activation : str or Symbol default functools.partial(symbol.LeakyReLU, act_type='leaky', slope=0.2) Type of activation function. @@ -1255,8 +1267,6 @@ class ConvLSTMCell(BaseConvRNNCell): Prefix for name of layers (and name of weight if params is None). params : RNNParams, default None Container for weight sharing between cells. Created if None. - forget_bias : bias added to forget gate, default 1.0. - Jozefowicz et al. 
2015 recommends setting this to 1.0 conv_layout : str, , default 'NCHW' Layout of ConvolutionOp """ @@ -1264,23 +1274,22 @@ def __init__(self, input_shape, num_hidden, h2h_kernel=(3, 3), h2h_dilate=(1, 1), i2h_kernel=(3, 3), i2h_stride=(1, 1), i2h_pad=(1, 1), i2h_dilate=(1, 1), + i2h_weight_initializer=None, h2h_weight_initializer=None, + i2h_bias_initializer='zeros', h2h_bias_initializer='zeros', activation=functools.partial(symbol.LeakyReLU, act_type='leaky', slope=0.2), - prefix='ConvLSTM_', params=None, forget_bias=1.0, + prefix='ConvLSTM_', params=None, conv_layout='NCHW'): super(ConvLSTMCell, self).__init__(input_shape=input_shape, num_hidden=num_hidden, h2h_kernel=h2h_kernel, h2h_dilate=h2h_dilate, i2h_kernel=i2h_kernel, i2h_stride=i2h_stride, i2h_pad=i2h_pad, i2h_dilate=i2h_dilate, + i2h_weight_initializer=i2h_weight_initializer, + h2h_weight_initializer=h2h_weight_initializer, + i2h_bias_initializer=i2h_bias_initializer, + h2h_bias_initializer=h2h_bias_initializer, activation=activation, prefix=prefix, params=params, conv_layout=conv_layout) - # Get params - self._iW = self.params.get('i2h_weight') - self._hW = self.params.get('h2h_weight') - # we add the forget_bias to i2h_bias, this adds the bias to the forget gate activation - self._iB = self.params.get('i2h_bias', init=init.LSTMBias(forget_bias=forget_bias)) - self._hB = self.params.get('h2h_bias') - @property def _gate_names(self): return ['_i', '_f', '_c', '_o'] @@ -1288,25 +1297,7 @@ def _gate_names(self): def __call__(self, inputs, states): self._counter += 1 name = '%st%d_'%(self._prefix, self._counter) - i2h = symbol.Convolution(name='%si2h'%name, - data=inputs, - num_filter=self._num_hidden*4, - kernel=self._i2h_kernel, - stride=self._i2h_stride, - pad=self._i2h_pad, - dilate=self._i2h_dilate, - weight=self._iW, - bias=self._iB,) - h2h = symbol.Convolution(name='%sh2h'%name, - data=states[0], - num_filter=self._num_hidden*4, - kernel=self._h2h_kernel, - dilate=self._h2h_dilate, - pad=self._h2h_pad, - stride=(1, 1), - weight=self._hW, - bias=self._hB) - + i2h, h2h = self._conv_forward(inputs, states, name) gates = i2h + h2h slice_gates = symbol.SliceChannel(gates, num_outputs=4, axis=self._conv_layout.find('C'), name="%sslice"%name) @@ -1346,6 +1337,16 @@ class ConvGRUCell(BaseConvRNNCell): Pad of Convolution operator in input-to-state transitions. i2h_dilate : tuple of int, default (1, 1) Dilation of Convolution operator in input-to-state transitions. + i2h_weight_initializer : str or Initializer + Initializer for the input weights matrix, used for the convolution + transformation of the inputs. + h2h_weight_initializer : str or Initializer + Initializer for the recurrent weights matrix, used for the convolution + transformation of the recurrent state. + i2h_bias_initializer : str or Initializer, default zeros + Initializer for the bias vector. + h2h_bias_initializer : str or Initializer, default zeros + Initializer for the bias vector. activation : str or Symbol, default functools.partial(symbol.LeakyReLU, act_type='leaky', slope=0.2) Type of activation function. 
@@ -1360,19 +1361,20 @@ def __init__(self, input_shape, num_hidden, h2h_kernel=(3, 3), h2h_dilate=(1, 1), i2h_kernel=(3, 3), i2h_stride=(1, 1), i2h_pad=(1, 1), i2h_dilate=(1, 1), + i2h_weight_initializer=None, h2h_weight_initializer=None, + i2h_bias_initializer='zeros', h2h_bias_initializer='zeros', activation=functools.partial(symbol.LeakyReLU, act_type='leaky', slope=0.2), prefix='ConvGRU_', params=None, conv_layout='NCHW'): super(ConvGRUCell, self).__init__(input_shape=input_shape, num_hidden=num_hidden, h2h_kernel=h2h_kernel, h2h_dilate=h2h_dilate, i2h_kernel=i2h_kernel, i2h_stride=i2h_stride, i2h_pad=i2h_pad, i2h_dilate=i2h_dilate, + i2h_weight_initializer=i2h_weight_initializer, + h2h_weight_initializer=h2h_weight_initializer, + i2h_bias_initializer=i2h_bias_initializer, + h2h_bias_initializer=h2h_bias_initializer, activation=activation, prefix=prefix, params=params, conv_layout=conv_layout) - # Get params - self._iW = self.params.get('i2h_weight') - self._hW = self.params.get('h2h_weight') - self._iB = self.params.get('i2h_bias') - self._hB = self.params.get('h2h_bias') @property def _gate_names(self): @@ -1382,22 +1384,7 @@ def __call__(self, inputs, states): self._counter += 1 seq_idx = self._counter name = '%st%d_' % (self._prefix, seq_idx) - i2h = symbol.Convolution(name='%s_i2h'%name, data=inputs, - num_filter=self._num_hidden * 3, - kernel=self._i2h_kernel, - stride=self._i2h_stride, - pad=self._i2h_pad, - dilate=self._i2h_dilate, - weight=self._iW, - bias=self._iB,) - h2h = symbol.Convolution(name='%s_h2h'%name, data=states[0], - num_filter=self._num_hidden * 3, - kernel=self._h2h_kernel, - dilate=self._h2h_dilate, - pad=self._h2h_pad, - stride=(1, 1), - weight=self._hW, - bias=self._hB) + i2h, h2h = self._conv_forward(inputs, states, name) i2h_r, i2h_z, i2h = symbol.SliceChannel(i2h, num_outputs=3, name="%s_i2h_slice" % name) h2h_r, h2h_z, h2h = symbol.SliceChannel(h2h, num_outputs=3, name="%s_h2h_slice" % name) diff --git a/tests/python/unittest/test_rnn.py b/tests/python/unittest/test_rnn.py index 75f41fe13389..e8176bb468c2 100644 --- a/tests/python/unittest/test_rnn.py +++ b/tests/python/unittest/test_rnn.py @@ -254,7 +254,7 @@ def test_convlstm(): h2h_kernel=(3, 3), h2h_dilate=(1, 1), i2h_kernel=(3, 3), i2h_stride=(1, 1), i2h_pad=(1, 1), i2h_dilate=(1, 1), - prefix='rnn_', forget_bias=1.0) + prefix='rnn_') inputs = [mx.sym.Variable('rnn_t%d_data'%i) for i in range(3)] outputs, _ = cell.unroll(3, inputs) outputs = mx.sym.Group(outputs) From c7d18e077d149784feaf08ad3d41d8302eb8e929 Mon Sep 17 00:00:00 2001 From: MinWoo Byeon Date: Fri, 4 Aug 2017 05:05:42 +0900 Subject: [PATCH 06/26] fix py3 compatibilities (#7305) * fix py3 compatibilities * fix py3 compatibilities --- tools/im2rec.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tools/im2rec.py b/tools/im2rec.py index a05bfb04621b..30ee3ec92a5e 100644 --- a/tools/im2rec.py +++ b/tools/im2rec.py @@ -11,9 +11,7 @@ import cv2 import time import traceback - -if sys.version_info[0] == 3: - xrange = range +from builtins import range try: import multiprocessing @@ -61,8 +59,8 @@ def make_list(args): random.seed(100) random.shuffle(image_list) N = len(image_list) - chunk_size = (N + args.chunks - 1) / args.chunks - for i in xrange(args.chunks): + chunk_size = (N + args.chunks - 1) // args.chunks + for i in range(args.chunks): chunk = image_list[i * chunk_size:(i + 1) * chunk_size] if args.chunks > 1: str_chunk = '_%d' % i @@ -130,16 +128,16 @@ def image_encode(args, i, item, q_out): return if 
args.center_crop: if img.shape[0] > img.shape[1]: - margin = (img.shape[0] - img.shape[1]) / 2; + margin = (img.shape[0] - img.shape[1]) // 2; img = img[margin:margin + img.shape[1], :] else: - margin = (img.shape[1] - img.shape[0]) / 2; + margin = (img.shape[1] - img.shape[0]) // 2; img = img[:, margin:margin + img.shape[0]] if args.resize: if img.shape[0] > img.shape[1]: - newsize = (args.resize, img.shape[0] * args.resize / img.shape[1]) + newsize = (args.resize, img.shape[0] * args.resize // img.shape[1]) else: - newsize = (img.shape[1] * args.resize / img.shape[0], args.resize) + newsize = (img.shape[1] * args.resize // img.shape[0], args.resize) img = cv2.resize(img, newsize) try: From 82a3d21104c348610d6f5e224c89a4382302725f Mon Sep 17 00:00:00 2001 From: Guneet Singh Dhillon Date: Thu, 3 Aug 2017 13:08:14 -0700 Subject: [PATCH 07/26] Added sparsity functionality, with tests (#7138) * added pruning for sgd * added pruning for example/image-classification * working example for imagenet to experiment on * added flexibility to start off with pruning * changes to imagenet code * minor changes for testing * changes to imagenet pruning * small changes to parameters for tests * DSD test on mnist added * improved sparsification, added sparse-sparse training, added pruning factor * changed test for more coverage * updated example * updated example to save models * added thresholding by user * made optimizer code cleaner, created tests - mlp and rnn * added thresholding functionality, and related tests * made minor change to tests * updated common file, changed to merger * merging * reverted for mshadow * reverted dmlc-core * back to old examples * removed spaces from code * added comments * another style change * made SparseSGD a subclass * removed dependencies from tests * minor changes * reduced checks - not needed * call sgd from sparsesgd * corrected syntax * corrected syntax * reverted back, handle epoch count myself * added DSD traning to examples * added mask generation logic * added comment on layer-wise vs global pruning * added update message in sparse_sgd * added an example * changes to README --- example/dsd/README.md | 30 +++++++ example/dsd/mlp.py | 125 ++++++++++++++++++++++++++++ example/dsd/sparse_sgd.py | 170 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 325 insertions(+) create mode 100644 example/dsd/README.md create mode 100644 example/dsd/mlp.py create mode 100644 example/dsd/sparse_sgd.py diff --git a/example/dsd/README.md b/example/dsd/README.md new file mode 100644 index 000000000000..0ce5cc5d1f0f --- /dev/null +++ b/example/dsd/README.md @@ -0,0 +1,30 @@ +DSD Training +============ +This folder contains an optimizer class that implements DSD training coupled with SGD. The training +procedure is described in the paper *DSD: Dense-Sparse-Dense Training for Deep Neural Networks*, +available at https://arxiv.org/pdf/1607.04381.pdf + +The optimizer class is flexible in the way it prunes weights. The user can define the following: +- The percentage sparsity they want or the thresholding value for the pruning +- The epochs at which they want a particular level of pruning + +Note that giving the sparsity level induces that level of sparsity in every layer of the neural +network. It layer-wise pruning, and not global pruning (which would require loooking at all the +weights of the neural network at the same time). However, global pruning can be done if the +threshold value is known to the user (by doing some preprocessing), and is passed to the optimizer. 
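+
+For example, a global threshold for a target overall sparsity can be estimated from a saved
+dense checkpoint before sparse training (a sketch; the checkpoint prefix, epoch and the 70%
+target below are illustrative placeholders):
+
+    import mxnet as mx
+    import numpy as np
+
+    # load a dense checkpoint, e.g. one saved by mlp.py
+    sym, arg_params, aux_params = mx.model.load_checkpoint('mlp', 10)
+    # gather every weight and bias into one flat array
+    all_params = np.concatenate([v.asnumpy().ravel() for v in arg_params.values()])
+    # absolute value below which 70% of all parameters (globally) fall
+    threshold = np.percentile(np.abs(all_params), 70)
+
+The resulting value can then be passed to the optimizer through the --weight_threshold and
+--bias_threshold options shown below.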
+ +## Example + +To test out the sparsity feature on a MLP, run the following script: + + python mlp.py --pruning_switch_epoch 4,7,10 --bias_sparsity 0,30,50 --weight_sparsity 0,50,70 + +This will train a MLP with 0% sparsity uptil epoch 4, with 30% bias and 50% weight sparsity uptil +epoch 7, 50% bias and 70% weight sparsity uptil epoch 10. + +To test out the thresholding feature on a MLP, run the following script: + + python mlp.py --pruning_switch_epoch 4,6 --bias_threshold 0,0.01 --weight_threshold 0,0.05 + +This will train a MLP with thresholding at 0 uptil epoch 4, with bias thresholding at 0.01 and +weight thresholding at 0.05 uptil epoch 6. diff --git a/example/dsd/mlp.py b/example/dsd/mlp.py new file mode 100644 index 000000000000..ccb094062f58 --- /dev/null +++ b/example/dsd/mlp.py @@ -0,0 +1,125 @@ +import mxnet as mx +import os +import logging +import argparse +from math import ceil +import sparse_sgd + +# symbol net +def get_symbol(): + data = mx.symbol.Variable('data') + fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128) + act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu") + fc2 = mx.symbol.FullyConnected(act1, name='fc2', num_hidden=64) + act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") + fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10) + softmax = mx.symbol.SoftmaxOutput(fc3, name='sm') + + return softmax + +# download ubyte version of mnist and untar +def download_data(): + if not os.path.isdir("data/"): + os.system("mkdir data/") + if (not os.path.exists('data/train-images-idx3-ubyte')) or \ + (not os.path.exists('data/train-labels-idx1-ubyte')) or \ + (not os.path.exists('data/t10k-images-idx3-ubyte')) or \ + (not os.path.exists('data/t10k-labels-idx1-ubyte')): + os.system("wget -q http://data.mxnet.io/mxnet/data/mnist.zip -P data/") + os.chdir("./data") + os.system("unzip -u mnist.zip") + os.chdir("..") + +# get data iterators +def get_iters(batch_size): + train = mx.io.MNISTIter( + image="data/train-images-idx3-ubyte", + label="data/train-labels-idx1-ubyte", + data_shape=(784,), + label_name='sm_label', + batch_size=batch_size, + shuffle=True, + flat=True, + silent=False, + seed=10) + val = mx.io.MNISTIter( + image="data/t10k-images-idx3-ubyte", + label="data/t10k-labels-idx1-ubyte", + data_shape=(784,), + label_name='sm_label', + batch_size=batch_size, + shuffle=True, + flat=True, + silent=False) + + return (train, val) + +def test_mlp(args): + # get parameters + prefix = './mlp' + batch_size = 100 + pruning_switch_epoch = [int(i) for i in args.pruning_switch_epoch.split(',')] + num_epoch = pruning_switch_epoch[-1] + batches_per_epoch = ceil(60000.0/batch_size) + weight_sparsity = args.weight_sparsity + bias_sparsity = args.bias_sparsity + weight_threshold = args.weight_threshold + bias_threshold = args.bias_threshold + if args.weight_sparsity: + weight_sparsity = [float(i) for i in args.weight_sparsity.split(',')] + bias_sparsity = [float(i) for i in args.bias_sparsity.split(',')] + else: + weight_threshold = [float(i) for i in args.weight_threshold.split(',')] + bias_threshold = [float(i) for i in args.bias_threshold.split(',')] + + # get symbols and iterators + sym = get_symbol() + download_data() + (train, val) = get_iters(batch_size) + + # fit model + model = mx.mod.Module( + sym, + context=[mx.cpu(i) for i in range(2)], + data_names=['data'], + label_names=['sm_label']) + optimizer_params = { + 'learning_rate' : 0.1, + 'wd' : 0.004, + 'momentum' : 0.9, + 'pruning_switch_epoch' : pruning_switch_epoch, + 
'batches_per_epoch' : batches_per_epoch, + 'weight_sparsity' : weight_sparsity, + 'bias_sparsity' : bias_sparsity, + 'weight_threshold' : weight_threshold, + 'bias_threshold' : bias_threshold} + logging.info('Start training...') + model.fit(train, + eval_data=val, + eval_metric='acc', + epoch_end_callback=mx.callback.do_checkpoint(prefix), + num_epoch=num_epoch, + optimizer='sparsesgd', + optimizer_params=optimizer_params) + logging.info('Finish traning...') + + # remove files + for i in range(num_epoch): + os.remove('%s-%04d.params' % (prefix, i + 1)) + os.remove('%s-symbol.json' % prefix) + + +if __name__ == "__main__": + + # print logging by default + logging.basicConfig(level=logging.DEBUG) + + parser = argparse.ArgumentParser(description="sparse training") + parser.add_argument('--pruning_switch_epoch', type=str) + parser.add_argument('--weight_sparsity', type=str, default=None) + parser.add_argument('--bias_sparsity', type=str, default=None) + parser.add_argument('--weight_threshold', type=str, default=None) + parser.add_argument('--bias_threshold', type=str, default=None) + args = parser.parse_args() + + test_mlp(args) diff --git a/example/dsd/sparse_sgd.py b/example/dsd/sparse_sgd.py new file mode 100644 index 000000000000..f11a2395c4c0 --- /dev/null +++ b/example/dsd/sparse_sgd.py @@ -0,0 +1,170 @@ +from mxnet.ndarray import NDArray, topk, abs as NDabs +from mxnet.optimizer import SGD, register +import logging + +log = 'Sparsity Update:\t' + +@register +class SparseSGD(SGD): + """The SGD optimizer with weight pruning. + + This class implements the optimizer described in the paper *DSD: Dense-Sparse-Dense Training for + Deep Neural Networks*, available at https://arxiv.org/pdf/1607.04381.pdf + + The optimizer updates the weights the same way as done in SGD, but does the following + preprocessing:: + + if threshold given, all weights below the threshold in absolute value are pruned, + mask = abs(weight) >= threshold + if sparsity level given, the smallest (sparsity)% weights in absolute value are pruned + (or the largest (100-sparsity)% weights in absolute value are used) + mask = topk(abs(weight), ret_typ='mask', k=weight.size*(100-sparsity)/100) + + => mask[i,j] = {0 if weight[i,j] is pruned, 1 otherwise} (for a matrix representation) + + weight = weight * mask + grad = grad * mask + state = state * mask + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.SGD`. + + Parameters + ---------- + pruning_switch_epoch : list of ints, optional + The epochs at which there is a change in sparsity level (should be in ascending order). + + weight_sparsity : list of floats, optional + The sparsity on the weights required on each iteration of sparse training. + + bias_sparsity : list of floats, optional + The sparsity on the biases required on each iteration of sparse training. + + weight_threshold : list of floats, optional + The absolute value threshold on the weights required on each iteration of sparse training. + + bias_threshold : list of floats, optional + The absolute value threshold on the biases required on each iteration of sparse training. + + batches_per_epoch : int, optional + The number of batches in each epoch. 
+ (The ceiling integer value of number_of_examples / batch_size) + """ + def __init__(self, pruning_switch_epoch, batches_per_epoch, + weight_sparsity=None, bias_sparsity=None, + weight_threshold=None, bias_threshold=None, **kwargs): + super(SparseSGD, self).__init__(**kwargs) + + self.masks = [] + self.masks_updated = False + self.epoch = 0 + self.pruning_switch_epoch = pruning_switch_epoch + self.batches_per_epoch = batches_per_epoch + + # get weight and bias sparsity percentages + self.weight_sparsity = weight_sparsity + self.bias_sparsity = bias_sparsity + if weight_sparsity is not None: + assert len(weight_sparsity) == len(bias_sparsity), \ + 'weight_sparsity and bias_sparsity should have same length' + assert len(weight_sparsity) == len(pruning_switch_epoch), \ + 'pruning_switch_epoch and weight_sparsity should have same length' + + # get weight and bias sparsity thresholds + self.weight_threshold = weight_threshold + self.bias_threshold = bias_threshold + if weight_threshold is not None: + assert len(weight_threshold) == len(bias_threshold), \ + 'weight_threshold and bias_threshold should have same length' + assert len(weight_threshold) == len(pruning_switch_epoch), \ + 'pruning_switch_epoch and weight_sparsity_threshold should have same length' + + # either percentages or thresholds must be given + assert weight_sparsity is not None or weight_threshold is not None,\ + 'weight_sparsity or weight_sparsity_threshold should be given' + + def update_masks(self, index, weight): + """Updates the masks for sparse training. + + Parameters + ---------- + index : int + The index for weight. + weight : NDArray + The weight matrix. + + Returns + ------- + boolean + If the masks were changed + """ + # determine number of updates without actually updating the count + if index not in self._index_update_count: + num_update = self.begin_num_update + else: + num_update = self._index_update_count[index] + num_update += 1 + num_update = max(num_update, self.num_update) + + # calculate epoch + epoch = int((num_update - 1) / self.batches_per_epoch) + 1 + + # determine if masks need to be updated, and get corresponding parameters + if index == 0: + self.masks_updated = True + if self.epoch != epoch: + self.epoch = epoch + if epoch == 1: + self.masks_updated = False + if self.weight_sparsity is not None: + logging.info(log + 'bias-sparsity={}, weight-sparsity={}'.format(self.bias_sparsity[0], self.weight_sparsity[0])) + else: + logging.info(log + 'bias-threshold={}, weight-threshold={}'.format(self.bias_threshold[0], self.weight_threshold[0])) + if self.pruning_switch_epoch[0] + 1 == epoch: + self.masks_updated = False + self.pruning_switch_epoch.pop(0) + if self.weight_sparsity is not None: + self.weight_sparsity.pop(0) + self.bias_sparsity.pop(0) + logging.info(log + 'bias-sparsity={}, weight-sparsity={}'.format(self.bias_sparsity[0], self.weight_sparsity[0])) + else: + self.weight_threshold.pop(0) + self.bias_threshold.pop(0) + logging.info(log + 'bias-threshold={}, weight-threshold={}'.format(self.bias_threshold[0], self.weight_threshold[0])) + + # update masks if needed + if not self.masks_updated: + # initialize masks + if epoch == 1: + self.masks.append(None) + # if percentages are given + if self.weight_sparsity is not None: + if len(weight.shape) == 1: + sparsity = self.bias_sparsity[0] + else: + sparsity = self.weight_sparsity[0] + number_unpruned = int((100.0 - sparsity) * weight.size / 100.0) + self.masks[index] = topk(NDabs(weight), axis=None, ret_typ='mask', + k=number_unpruned) + # if 
thresholds are given + else: + if len(weight.shape) == 1: + threshold = self.bias_threshold[0] + else: + threshold = self.weight_threshold[0] + self.masks[index] = NDabs(weight) >= threshold + + return not self.masks_updated + + def update(self, index, weight, grad, state): + assert(isinstance(weight, NDArray)) + assert(isinstance(grad, NDArray)) + + # preprocessing for pruning + if self.update_masks(index, weight): + weight[:] = weight * self.masks[index] + grad[:] = grad * self.masks[index] + if state is not None: + state[:] = state * self.masks[index] + + super(SparseSGD, self).update(index, weight, grad, state) From 9add5ae417cd6fa5e9153c1f19195f5b88c01305 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=81=E5=BE=B7=E6=BE=8E?= Date: Fri, 4 Aug 2017 04:10:52 +0800 Subject: [PATCH 08/26] improve convert_symbol.py add support to SUM with coeff (#7120) * improve convert_symbol.py add support to SUM with coeff * fix code style * fix code style * fix code style --- tools/caffe_converter/convert_symbol.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/caffe_converter/convert_symbol.py b/tools/caffe_converter/convert_symbol.py index fad89c41e83c..c384c7690088 100644 --- a/tools/caffe_converter/convert_symbol.py +++ b/tools/caffe_converter/convert_symbol.py @@ -207,6 +207,7 @@ def _parse_proto(prototxt_fname): need_flatten[name] = need_flatten[mapping[layer.bottom[0]]] if layer.type == 'Eltwise': type_string = 'mx.symbol.broadcast_add' + param = layer.eltwise_param param_string = "" need_flatten[name] = False if layer.type == 'Reshape': @@ -239,8 +240,15 @@ def _parse_proto(prototxt_fname): symbol_string += "%s = %s(name='%s', data=%s %s)\n" % ( name, type_string, name, mapping[bottom[0]], param_string) else: - symbol_string += "%s = %s(name='%s', *[%s] %s)\n" % ( - name, type_string, name, ','.join([mapping[x] for x in bottom]), param_string) + if layer.type == 'Eltwise' and param.operation == 1 and len(param.coeff) > 0: + symbol_string += "%s = " % name + symbol_string += " + ".join(["%s * %s" % ( + mapping[bottom[i]], param.coeff[i]) for i in range(len(param.coeff))]) + symbol_string += "\n" + else: + symbol_string += "%s = %s(name='%s', *[%s] %s)\n" % ( + name, type_string, name, ','.join( + [mapping[x] for x in bottom]), param_string) for j in range(len(layer.top)): mapping[layer.top[j]] = name output_name = name From 8519eafd1d355f66245d9ce074c4f451bceef311 Mon Sep 17 00:00:00 2001 From: qingzhouzhen <576591769@qq.com> Date: Fri, 4 Aug 2017 04:12:28 +0800 Subject: [PATCH 09/26] add mobilenet (#7121) --- .../image-classification/symbols/mobilenet.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 example/image-classification/symbols/mobilenet.py diff --git a/example/image-classification/symbols/mobilenet.py b/example/image-classification/symbols/mobilenet.py new file mode 100644 index 000000000000..cf470bace727 --- /dev/null +++ b/example/image-classification/symbols/mobilenet.py @@ -0,0 +1,45 @@ +import mxnet as mx + +def Conv(data, num_filter=1, kernel=(1, 1), stride=(1, 1), pad=(0, 0), num_group=1, name=None, suffix=''): + conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, num_group=num_group, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix)) + bn = mx.sym.BatchNorm(data=conv, name='%s%s_batchnorm' %(name, suffix), fix_gamma=True) + act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix)) + return act + +def get_symbol(num_classes, **kwargs): + 
data = mx.symbol.Variable(name="data") # 224 + conv_1 = Conv(data, num_filter=32, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_1") # 224/112 + conv_2_dw = Conv(conv_1, num_group=32, num_filter=32, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_2_dw") # 112/112 + conv_2 = Conv(conv_2_dw, num_filter=64, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_2") # 112/112 + conv_3_dw = Conv(conv_2, num_group=64, num_filter=64, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_3_dw") # 112/56 + conv_3 = Conv(conv_3_dw, num_filter=128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_3") # 56/56 + conv_4_dw = Conv(conv_3, num_group=128, num_filter=128, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_4_dw") # 56/56 + conv_4 = Conv(conv_3_dw, num_filter=128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_4") # 56/56 + conv_5_dw = Conv(conv_4, num_group=128, num_filter=128, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_5_dw") # 56/28 + conv_5 = Conv(conv_5_dw, num_filter=256, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_5") # 28/28 + conv_6_dw = Conv(conv_5, num_group=256, num_filter=256, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_6_dw") # 28/28 + conv_6 = Conv(conv_6_dw, num_filter=256, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_6") # 28/28 + conv_7_dw = Conv(conv_6, num_group=256, num_filter=256, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_7_dw") # 28/14 + conv_7 = Conv(conv_7_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_7") # 14/14 + + conv_8_dw = Conv(conv_7, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_8_dw") # 14/14 + conv_8 = Conv(conv_8_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_8") # 14/14 + conv_9_dw = Conv(conv_8, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_9_dw") # 14/14 + conv_9 = Conv(conv_9_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_9") # 14/14 + conv_10_dw = Conv(conv_9, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_10_dw") # 14/14 + conv_10 = Conv(conv_10_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_10") # 14/14 + conv_11_dw = Conv(conv_10, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_11_dw") # 14/14 + conv_11 = Conv(conv_11_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_11") # 14/14 + conv_12_dw = Conv(conv_11, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_12_dw") # 14/14 + conv_12 = Conv(conv_12_dw, num_filter=512, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_12") # 14/14 + + conv_13_dw = Conv(conv_12, num_group=512, num_filter=512, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_13_dw") # 14/7 + conv_13 = Conv(conv_13_dw, num_filter=1024, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_13") # 7/7 + conv_14_dw = Conv(conv_13, num_group=1024, num_filter=1024, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_14_dw") # 7/7 + conv_14 = Conv(conv_14_dw, num_filter=1024, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_14") # 7/7 + + pool = mx.sym.Pooling(data=conv_14, kernel=(7, 7), stride=(1, 1), pool_type="avg", name="global_pool") + flatten = mx.sym.Flatten(data=pool, name="flatten") + fc = mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes, name='fc') + softmax = mx.symbol.SoftmaxOutput(data=fc, name='softmax') + return softmax From 
76dee53dc494b35ce7dd4ac88cdce817bf9aa1ce Mon Sep 17 00:00:00 2001 From: CNevd Date: Fri, 4 Aug 2017 04:18:07 +0800 Subject: [PATCH 10/26] [cpp-package] add lr scheduler (#6885) * add lr scheduler * Update lr_scheduler.h * Update mlp_gpu.cpp * Update test_score.cpp * update optimizer.hpp --- cpp-package/example/alexnet.cpp | 11 ++- cpp-package/example/charRNN.cpp | 11 ++- cpp-package/example/googlenet.cpp | 19 +++-- cpp-package/example/inception_bn.cpp | 12 ++- cpp-package/example/lenet.cpp | 17 ++-- cpp-package/example/lenet_with_mxdataiter.cpp | 20 +++-- cpp-package/example/mlp_cpu.cpp | 20 +++-- cpp-package/example/mlp_gpu.cpp | 43 ++++++---- cpp-package/example/resnet.cpp | 11 ++- cpp-package/example/test_score.cpp | 22 ++++-- cpp-package/include/mxnet-cpp/executor.h | 12 --- cpp-package/include/mxnet-cpp/executor.hpp | 7 -- cpp-package/include/mxnet-cpp/lr_scheduler.h | 78 +++++++++++++++++++ cpp-package/include/mxnet-cpp/optimizer.h | 22 +++--- cpp-package/include/mxnet-cpp/optimizer.hpp | 42 +++++++--- 15 files changed, 254 insertions(+), 93 deletions(-) create mode 100644 cpp-package/include/mxnet-cpp/lr_scheduler.h diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp index c0d8273d559b..6a9e01ab01a0 100644 --- a/cpp-package/example/alexnet.cpp +++ b/cpp-package/example/alexnet.cpp @@ -199,6 +199,7 @@ int main(int argc, char const *argv[]) { /*with data and label, executor can be generated automatically*/ auto *exec = Net.SimpleBind(ctx, args_map); + auto arg_names = Net.ListArguments(); aux_map = exec->aux_dict(); args_map = exec->arg_dict(); @@ -240,7 +241,9 @@ int main(int argc, char const *argv[]) { Optimizer* opt = OptimizerRegistry::Find("ccsgd"); opt->SetParam("momentum", 0.9) ->SetParam("rescale_grad", 1.0 / batch_size) - ->SetParam("clip_gradient", 10); + ->SetParam("clip_gradient", 10) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); Accuracy acu_train, acu_val; LogLoss logloss_val; @@ -258,7 +261,11 @@ int main(int argc, char const *argv[]) { batch.label.CopyTo(&args_map["label"]); exec->Forward(true); exec->Backward(); - exec->UpdateAll(opt, learning_rate, weight_decay); + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "data" || arg_names[i] == "label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } + NDArray::WaitAll(); acu_train.Update(batch.label, exec->outputs[0]); } diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp index 5cb6382137c7..d95c97d8e734 100644 --- a/cpp-package/example/charRNN.cpp +++ b/cpp-package/example/charRNN.cpp @@ -451,6 +451,8 @@ void train(const string file, int batch_size, int max_epoch, int start_epoch) { mx_float learning_rate = 0.0002; mx_float weight_decay = 0.000002; Optimizer* opt = OptimizerRegistry::Find("ccsgd"); + opt->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); // opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size) // ->SetParam("clip_gradient", 10); @@ -470,7 +472,10 @@ void train(const string file, int batch_size, int max_epoch, int start_epoch) { exe->Forward(true); exe->Backward(); - exe->UpdateAll(opt, learning_rate, weight_decay); + for (size_t i = 0; i < exe->arg_arrays.size(); ++i) { + opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]); + } + NDArray::WaitAll(); } auto toc = chrono::system_clock::now(); @@ -547,7 +552,9 @@ void trainWithBuiltInRNNOp(const string file, int batch_size, int max_epoch, int exe->Forward(true); exe->Backward(); - 
exe->UpdateAll(opt, learning_rate, weight_decay); + for (size_t i = 0; i < exe->arg_arrays.size(); ++i) { + opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]); + } NDArray::WaitAll(); } auto toc = chrono::system_clock::now(); diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp index a4dcbbd4a6cf..2e59fbfe45cd 100644 --- a/cpp-package/example/googlenet.cpp +++ b/cpp-package/example/googlenet.cpp @@ -128,7 +128,13 @@ int main(int argc, char const *argv[]) { Optimizer* opt = OptimizerRegistry::Find("ccsgd"); opt->SetParam("momentum", 0.9) ->SetParam("rescale_grad", 1.0 / batch_size) - ->SetParam("clip_gradient", 10); + ->SetParam("clip_gradient", 10) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); + + + auto *exec = googlenet.SimpleBind(Context::gpu(), args_map); + auto arg_names = googlenet.ListArguments(); for (int iter = 0; iter < max_epoch; ++iter) { LG << "Epoch: " << iter; @@ -138,11 +144,12 @@ int main(int argc, char const *argv[]) { args_map["data"] = data_batch.data.Copy(Context::gpu()); args_map["data_label"] = data_batch.label.Copy(Context::gpu()); NDArray::WaitAll(); - auto *exec = googlenet.SimpleBind(Context::gpu(), args_map); exec->Forward(true); exec->Backward(); - exec->UpdateAll(opt, learning_rate, weight_decay); - delete exec; + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } } Accuracy acu; @@ -152,14 +159,14 @@ int main(int argc, char const *argv[]) { args_map["data"] = data_batch.data.Copy(Context::gpu()); args_map["data_label"] = data_batch.label.Copy(Context::gpu()); NDArray::WaitAll(); - auto *exec = googlenet.SimpleBind(Context::gpu(), args_map); exec->Forward(false); NDArray::WaitAll(); acu.Update(data_batch.label, exec->outputs[0]); - delete exec; } LG << "Accuracy: " << acu.Get(); } + + delete exec; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp index 5db4f81b0e07..4442e006b5a5 100644 --- a/cpp-package/example/inception_bn.cpp +++ b/cpp-package/example/inception_bn.cpp @@ -156,9 +156,12 @@ int main(int argc, char const *argv[]) { Optimizer* opt = OptimizerRegistry::Find("ccsgd"); opt->SetParam("momentum", 0.9) ->SetParam("rescale_grad", 1.0 / batch_size) - ->SetParam("clip_gradient", 10); + ->SetParam("clip_gradient", 10) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); auto *exec = inception_bn_net.SimpleBind(Context::gpu(), args_map); + auto arg_names = inception_bn_net.ListArguments(); for (int iter = 0; iter < max_epoch; ++iter) { LG << "Epoch: " << iter; @@ -171,7 +174,12 @@ int main(int argc, char const *argv[]) { exec->Forward(true); exec->Backward(); - exec->UpdateAll(opt, learning_rate, weight_decay); + // Update parameters + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } + NDArray::WaitAll(); } diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp index 91b83a090fa3..56f8d2c8743a 100644 --- a/cpp-package/example/lenet.cpp +++ b/cpp-package/example/lenet.cpp @@ -118,7 +118,12 @@ class Lenet { Optimizer* opt = OptimizerRegistry::Find("ccsgd"); opt->SetParam("momentum", 0.9) ->SetParam("rescale_grad", 1.0) - ->SetParam("clip_gradient", 10); + ->SetParam("clip_gradient", 10) + ->SetParam("lr", learning_rate) + 
->SetParam("wd", weight_decay); + + Executor *exe = lenet.SimpleBind(ctx_dev, args_map); + auto arg_names = lenet.ListArguments(); for (int ITER = 0; ITER < max_epoch; ++ITER) { size_t start_index = 0; @@ -135,17 +140,19 @@ class Lenet { start_index += batch_size; NDArray::WaitAll(); - Executor *exe = lenet.SimpleBind(ctx_dev, args_map); exe->Forward(true); exe->Backward(); - exe->UpdateAll(opt, learning_rate, weight_decay); - - delete exe; + // Update parameters + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; + opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]); + } } LG << "Iter " << ITER << ", accuracy: " << ValAccuracy(batch_size * 10, lenet); } + delete exe; } private: diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp index 85a4b2012eb6..f6301b52a61f 100644 --- a/cpp-package/example/lenet_with_mxdataiter.cpp +++ b/cpp-package/example/lenet_with_mxdataiter.cpp @@ -85,7 +85,13 @@ int main(int argc, char const *argv[]) { Optimizer* opt = OptimizerRegistry::Find("ccsgd"); opt->SetParam("momentum", 0.9) ->SetParam("rescale_grad", 1.0) - ->SetParam("clip_gradient", 10); + ->SetParam("clip_gradient", 10) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); + + + auto *exec = lenet.SimpleBind(Context::gpu(), args_map); + auto arg_names = lenet.ListArguments(); for (int iter = 0; iter < max_epoch; ++iter) { LG << "Epoch: " << iter; @@ -95,11 +101,13 @@ int main(int argc, char const *argv[]) { args_map["data"] = data_batch.data.Copy(Context::gpu()); args_map["data_label"] = data_batch.label.Copy(Context::gpu()); NDArray::WaitAll(); - auto *exec = lenet.SimpleBind(Context::gpu(), args_map); exec->Forward(true); exec->Backward(); - exec->UpdateAll(opt, learning_rate, weight_decay); - delete exec; + // Update parameters + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } } Accuracy acu; @@ -109,14 +117,14 @@ int main(int argc, char const *argv[]) { args_map["data"] = data_batch.data.Copy(Context::gpu()); args_map["data_label"] = data_batch.label.Copy(Context::gpu()); NDArray::WaitAll(); - auto *exec = lenet.SimpleBind(Context::gpu(), args_map); exec->Forward(false); NDArray::WaitAll(); acu.Update(data_batch.label, exec->outputs[0]); - delete exec; } LG << "Accuracy: " << acu.Get(); } + + delete exec; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp index 69486490194c..358e8348ac5e 100644 --- a/cpp-package/example/mlp_cpu.cpp +++ b/cpp-package/example/mlp_cpu.cpp @@ -70,7 +70,13 @@ int main(int argc, char** argv) { // Create sgd optimizer Optimizer* opt = OptimizerRegistry::Find("sgd"); - opt->SetParam("rescale_grad", 1.0/batch_size); + opt->SetParam("rescale_grad", 1.0/batch_size) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); + + // Create executor by binding parameters to the model + auto *exec = net.SimpleBind(ctx, args); + auto arg_names = net.ListArguments(); // Start training for (int iter = 0; iter < max_epoch; ++iter) { @@ -85,15 +91,14 @@ int main(int argc, char** argv) { args["X"] = data_batch.data; args["label"] = data_batch.label; - // Create executor by binding parameters to the model - auto *exec = net.SimpleBind(ctx, args); // Compute gradients exec->Forward(true); exec->Backward(); // Update parameters - 
exec->UpdateAll(opt, learning_rate, weight_decay); - // Remember to free the memory - delete exec; + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "X" || arg_names[i] == "label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } } auto toc = chrono::system_clock::now(); @@ -103,16 +108,15 @@ int main(int argc, char** argv) { auto data_batch = val_iter.GetDataBatch(); args["X"] = data_batch.data; args["label"] = data_batch.label; - auto *exec = net.SimpleBind(ctx, args); // Forward pass is enough as no gradient is needed when evaluating exec->Forward(false); acc.Update(data_batch.label, exec->outputs[0]); - delete exec; } float duration = chrono::duration_cast(toc - tic).count() / 1000.0; LG << "Epoch: " << iter << " " << samples/duration << " samples/sec Accuracy: " << acc.Get(); } + delete exec; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/mlp_gpu.cpp b/cpp-package/example/mlp_gpu.cpp index 23be637437ff..a6281c385dfb 100644 --- a/cpp-package/example/mlp_gpu.cpp +++ b/cpp-package/example/mlp_gpu.cpp @@ -24,7 +24,7 @@ Symbol mlp(const vector &layers) { weights[i], biases[i], layers[i]); - outputs[i] = i == layers.size()-1? fc : Activation(fc, ActivationActType::kRelu); + outputs[i] = i == layers.size()-1 ? fc : Activation(fc, ActivationActType::kRelu); } return SoftmaxOutput(outputs.back(), label); @@ -70,12 +70,24 @@ int main(int argc, char** argv) { // Create sgd optimizer Optimizer* opt = OptimizerRegistry::Find("sgd"); - opt->SetParam("rescale_grad", 1.0/batch_size); + opt->SetParam("rescale_grad", 1.0/batch_size) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); + std::unique_ptr lr_sch(new FactorScheduler(5000, 0.1)); + opt->SetLRScheduler(std::move(lr_sch)); + + // Create executor by binding parameters to the model + auto *exec = net.SimpleBind(ctx, args); + auto arg_names = net.ListArguments(); + + // Create metrics + Accuracy train_acc, val_acc; // Start training for (int iter = 0; iter < max_epoch; ++iter) { int samples = 0; train_iter.Reset(); + train_acc.Reset(); auto tic = chrono::system_clock::now(); while (train_iter.Next()) { @@ -87,35 +99,40 @@ int main(int argc, char** argv) { // CopyTo is imperative, need to wait for it to complete. 
NDArray::WaitAll(); - // Create executor by binding parameters to the model - auto *exec = net.SimpleBind(ctx, args); // Compute gradients exec->Forward(true); exec->Backward(); + // Update parameters - exec->UpdateAll(opt, learning_rate, weight_decay); - // Remember to free the memory - delete exec; + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "X" || arg_names[i] == "label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } + // Update metric + train_acc.Update(data_batch.label, exec->outputs[0]); } + // one epoch of training is finished auto toc = chrono::system_clock::now(); + float duration = chrono::duration_cast(toc - tic).count() / 1000.0; + LG << "Epoch[" << iter << "] " << samples/duration \ + << " samples/sec " << "Train-Accuracy=" << train_acc.Get();; - Accuracy acc; val_iter.Reset(); + val_acc.Reset(); while (val_iter.Next()) { auto data_batch = val_iter.GetDataBatch(); data_batch.data.CopyTo(&args["X"]); data_batch.label.CopyTo(&args["label"]); NDArray::WaitAll(); - auto *exec = net.SimpleBind(ctx, args); + // Only forward pass is enough as no gradient is needed when evaluating exec->Forward(false); - acc.Update(data_batch.label, exec->outputs[0]); - delete exec; + val_acc.Update(data_batch.label, exec->outputs[0]); } - float duration = chrono::duration_cast(toc - tic).count() / 1000.0; - LG << "Epoch: " << iter << " " << samples/duration << " samples/sec Accuracy: " << acc.Get(); + LG << "Epoch[" << iter << "] Val-Accuracy=" << val_acc.Get(); } + delete exec; MXNotifyShutdown(); return 0; } diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp index 5521567e119d..b9766c7a64d0 100644 --- a/cpp-package/example/resnet.cpp +++ b/cpp-package/example/resnet.cpp @@ -165,11 +165,14 @@ int main(int argc, char const *argv[]) { .CreateDataIter(); Optimizer* opt = OptimizerRegistry::Find("ccsgd"); - opt->SetParam("momentum", 0.9) + opt->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay) + ->SetParam("momentum", 0.9) ->SetParam("rescale_grad", 1.0 / batch_size) ->SetParam("clip_gradient", 10); auto *exec = resnet.SimpleBind(Context::gpu(), args_map); + auto arg_names = resnet.ListArguments(); for (int iter = 0; iter < max_epoch; ++iter) { LG << "Epoch: " << iter; @@ -182,7 +185,11 @@ int main(int argc, char const *argv[]) { exec->Forward(true); exec->Backward(); - exec->UpdateAll(opt, learning_rate, weight_decay); + + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } NDArray::WaitAll(); } diff --git a/cpp-package/example/test_score.cpp b/cpp-package/example/test_score.cpp index 7dccd30b6f94..35342699558f 100644 --- a/cpp-package/example/test_score.cpp +++ b/cpp-package/example/test_score.cpp @@ -72,7 +72,15 @@ int main(int argc, char** argv) { // Create sgd optimizer Optimizer* opt = OptimizerRegistry::Find("sgd"); - opt->SetParam("rescale_grad", 1.0/batch_size); + opt->SetParam("rescale_grad", 1.0/batch_size) + ->SetParam("lr", learning_rate) + ->SetParam("wd", weight_decay); + std::unique_ptr lr_sch(new FactorScheduler(5000, 0.1)); + opt->SetLRScheduler(std::move(lr_sch)); + + // Create executor by binding parameters to the model + auto *exec = net.SimpleBind(ctx, args); + auto arg_names = net.ListArguments(); float score = 0; // Start training @@ -90,15 +98,14 @@ int main(int argc, char** argv) { // CopyTo is imperative, need to wait for it to complete. 
NDArray::WaitAll(); - // Create executor by binding parameters to the model - auto *exec = net.SimpleBind(ctx, args); // Compute gradients exec->Forward(true); exec->Backward(); // Update parameters - exec->UpdateAll(opt, learning_rate, weight_decay); - // Remember to free the memory - delete exec; + for (size_t i = 0; i < arg_names.size(); ++i) { + if (arg_names[i] == "X" || arg_names[i] == "label") continue; + opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); + } } auto toc = chrono::system_clock::now(); @@ -109,17 +116,16 @@ int main(int argc, char** argv) { data_batch.data.CopyTo(&args["X"]); data_batch.label.CopyTo(&args["label"]); NDArray::WaitAll(); - auto *exec = net.SimpleBind(ctx, args); // Only forward pass is enough as no gradient is needed when evaluating exec->Forward(false); acc.Update(data_batch.label, exec->outputs[0]); - delete exec; } float duration = chrono::duration_cast(toc - tic).count() / 1000.0; LG << "Epoch: " << iter << " " << samples/duration << " samples/sec Accuracy: " << acc.Get(); score = acc.Get(); } + delete exec; MXNotifyShutdown(); return score >= MIN_SCORE ? 0 : 1; } diff --git a/cpp-package/include/mxnet-cpp/executor.h b/cpp-package/include/mxnet-cpp/executor.h index 822344b7efee..67eec0100b65 100644 --- a/cpp-package/include/mxnet-cpp/executor.h +++ b/cpp-package/include/mxnet-cpp/executor.h @@ -79,18 +79,6 @@ class Executor { */ std::string DebugStr(); /*! - * \brief update the arguments with given learning rate and optimizer - * \param opt the pointer to the optimizer - * \param lr learning rate - * \param wd weight decay - * \param arg_update_begin begin index of the arguments to be updated, it - * starts after the input data by default - * \param arg_update_end end index of the arguments to be updated, it ends - * before the label data by default - */ - void UpdateAll(Optimizer *opt, float lr, float wd, int arg_update_begin = 1, - int arg_update_end = -1); - /*! * \brief destructor, free the handle */ ~Executor() { MXExecutorFree(handle_); } diff --git a/cpp-package/include/mxnet-cpp/executor.hpp b/cpp-package/include/mxnet-cpp/executor.hpp index 1a452a1610db..6887956290c2 100644 --- a/cpp-package/include/mxnet-cpp/executor.hpp +++ b/cpp-package/include/mxnet-cpp/executor.hpp @@ -79,13 +79,6 @@ inline std::string Executor::DebugStr() { return std::string(output); } -inline void Executor::UpdateAll(Optimizer *opt, float lr, float wd, - int arg_update_begin, int arg_update_end) { - arg_update_end = arg_update_end < 0 ? arg_arrays.size() - 1 : arg_update_end; - for (int i = arg_update_begin; i < arg_update_end; ++i) { - opt->Update(i, arg_arrays[i], grad_arrays[i], lr, wd); - } -} } // namespace cpp } // namespace mxnet diff --git a/cpp-package/include/mxnet-cpp/lr_scheduler.h b/cpp-package/include/mxnet-cpp/lr_scheduler.h new file mode 100644 index 000000000000..91f9b3c0a952 --- /dev/null +++ b/cpp-package/include/mxnet-cpp/lr_scheduler.h @@ -0,0 +1,78 @@ +/*! +* Copyright (c) 2017 by Contributors +* \file lr_scheduler.h +* \brief Scheduling learning rate +*/ + +#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_LR_SCHEDULER_H_ +#define CPP_PACKAGE_INCLUDE_MXNET_CPP_LR_SCHEDULER_H_ + +#include "dmlc/logging.h" + +namespace mxnet { +namespace cpp { + +/*! +* \brief lr scheduler interface +*/ +class LRScheduler { + public: + /*! + * \brief constructor + * \param base_lr the initial learning rate. + */ + explicit LRScheduler(float base_lr = 0.01) + : base_lr_(base_lr) {} + /*! 
+ * \brief set base lr + * \param lr learning rate from optimizer + */ + void SetLR(const float lr) { base_lr_ = lr; } + /*! + * \brief get a new learning rate + */ + virtual float GetLR(unsigned num_update) = 0; + /*! + * \brief destructor + */ + virtual ~LRScheduler() {} + + protected: + float base_lr_; +}; + +class FactorScheduler : public LRScheduler { + public: + explicit FactorScheduler(int step, float factor = 1, float stop_factor_lr = 1e-8) + : LRScheduler() { + step_ = step; + factor_ = factor; + stop_factor_lr_ = stop_factor_lr; + } + + float GetLR(unsigned num_update) override { + while (num_update > unsigned(count_ + step_)) { + count_ += step_; + base_lr_ *= factor_; + if (base_lr_ < stop_factor_lr_) { + base_lr_ = stop_factor_lr_; + LG << "Update[" << num_update << "]: now learning rate arrived at " \ + << base_lr_ << ", will not change in the future"; + } else { + LG << "Update[" << num_update << "]: Change learning rate to " << base_lr_; + } + } + return base_lr_; + } + + private: + int count_ = 0; + int step_; + float factor_; + float stop_factor_lr_; +}; + +} // namespace cpp +} // namespace mxnet + +#endif // CPP_PACKAGE_INCLUDE_MXNET_CPP_LR_SCHEDULER_H_ diff --git a/cpp-package/include/mxnet-cpp/optimizer.h b/cpp-package/include/mxnet-cpp/optimizer.h index 76f8a3564fbe..1bc36d58fd1a 100644 --- a/cpp-package/include/mxnet-cpp/optimizer.h +++ b/cpp-package/include/mxnet-cpp/optimizer.h @@ -17,6 +17,7 @@ #include "dmlc/logging.h" #include "mxnet-cpp/ndarray.h" #include "mxnet-cpp/op_map.h" +#include "mxnet-cpp/lr_scheduler.h" namespace mxnet { namespace cpp { @@ -57,15 +58,16 @@ class Optimizer { return this; } /*! - * \brief Update a weight with gradient. - * \param index the unique index for the weight. - * \param weight the weight to update. - * \param grad gradient for the weight. - * \param lr learning rate. - * \param wd weight decay. + * \bried set the lr scheduler + * \param lrScheduler lr scheduler used for this optimizer + * \return reference if self */ - void Update(int index, NDArray weight, NDArray grad, mx_float lr, - mx_float wd); + Optimizer *SetLRScheduler(std::unique_ptr lrScheduler) { + CHECK(lrScheduler); + lrScheduler_ = std::move(lrScheduler); + lrScheduler_->SetLR(std::stof(params_["lr"])); + return this; + } /*! * \brief Update a weight with gradient. * \param index the unique index for the weight. 
@@ -92,7 +94,10 @@ class Optimizer { std::map count_; unsigned begin_num_update_, num_update_; unsigned UpdateCount_(int index); + float GetLR_(int index); + float GetWD_(int index); virtual void CreateState_(int index, NDArray weight); + std::unique_ptr lrScheduler_ = nullptr; }; typedef std::function OptimizerCreator; @@ -172,7 +177,6 @@ class AdaDeltaOptimizer : public Optimizer { std::map acc_g_, acc_delta_; }; - } // namespace cpp } // namespace mxnet diff --git a/cpp-package/include/mxnet-cpp/optimizer.hpp b/cpp-package/include/mxnet-cpp/optimizer.hpp index 9dcb158b9e14..0d6a7be9dd6b 100644 --- a/cpp-package/include/mxnet-cpp/optimizer.hpp +++ b/cpp-package/include/mxnet-cpp/optimizer.hpp @@ -42,6 +42,8 @@ namespace cpp { inline Optimizer::Optimizer(unsigned begin_num_update) : begin_num_update_(begin_num_update), num_update_(begin_num_update_) { + params_["lr"] = "0.01f"; + params_["wd"] = "0.f"; } inline std::map& OptimizerRegistry::cmap() { @@ -56,14 +58,6 @@ inline OpMap*& Optimizer::op_map() { inline Optimizer::~Optimizer() {} -inline void Optimizer::Update(int index, NDArray weight, NDArray grad, mx_float lr, - mx_float wd) { - params_["lr"] = std::to_string(lr); - params_["wd"] = std::to_string(wd); - UpdateCount_(index); - Update(index, weight, grad); -} - inline void Optimizer::CreateState_(int index, NDArray weight) { } @@ -100,6 +94,18 @@ inline unsigned Optimizer::UpdateCount_(int index) { return new_count; } +inline float Optimizer::GetLR_(int index) { + if (nullptr != lrScheduler_) { + return lrScheduler_->GetLR(num_update_); + } + return std::stof(params_["lr"]); +} + +inline float Optimizer::GetWD_(int index) { + float wd = std::stof(params_["wd"]); + return wd; +} + inline Optimizer* OptimizerRegistry::Find(const std::string& name) { MXNETCPP_REGISTER_OPTIMIZER(sgd, SGDOptimizer); MXNETCPP_REGISTER_OPTIMIZER(ccsgd, SGDOptimizer); // For backward compatibility @@ -140,6 +146,9 @@ inline void SGDOptimizer::Update(int index, NDArray weight, NDArray grad) { CreateState_(index, weight); } + params_["lr"] = std::to_string(GetLR_(index)); + params_["wd"] = std::to_string(GetWD_(index)); + UpdateCount_(index); auto keys = GetParamKeys_(); auto values = GetParamValues_(); CHECK_EQ(keys.size(), values.size()); @@ -203,6 +212,9 @@ inline void RMSPropOptimizer::Update(int index, NDArray weight, NDArray grad) { CreateState_(index, weight); } + params_["lr"] = std::to_string(GetLR_(index)); + params_["wd"] = std::to_string(GetWD_(index)); + UpdateCount_(index); auto keys = GetParamKeys_(); auto values = GetParamValues_(); CHECK_EQ(keys.size(), values.size()); @@ -257,6 +269,10 @@ inline void AdamOptimizer::Update(int index, NDArray weight, NDArray grad) { if (mean_.count(index) == 0) { CreateState_(index, weight); } + + params_["lr"] = std::to_string(GetLR_(index)); + params_["wd"] = std::to_string(GetWD_(index)); + UpdateCount_(index); auto keys = GetParamKeys_(); auto values = GetParamValues_(); CHECK_EQ(keys.size(), values.size()); @@ -306,9 +322,11 @@ inline void AdaGradOptimizer::Update(int index, NDArray weight, NDArray grad) { if (history_.count(index) == 0) { CreateState_(index, weight); } - float lr = std::stof(params_["lr"]); - float wd = std::stof(params_["wd"]); + float eps = std::stof(params_["eps"]); + float lr = GetLR_(index); + float wd = GetWD_(index); + UpdateCount_(index); if (params_.count("rescale_grad") > 0) { grad *= std::stof(params_["rescale_grad"]); } @@ -345,9 +363,11 @@ inline void AdaDeltaOptimizer::Update(int index, NDArray weight, NDArray grad) { if 
(acc_g_.count(index) == 0) { CreateState_(index, weight); } - float wd = std::stof(params_["wd"]); + float rho = std::stof(params_["rho"]); float epsilon = std::stof(params_["epsilon"]); + float wd = GetWD_(index); + UpdateCount_(index); if (params_.count("rescale_grad") > 0) { grad *= std::stof(params_["rescale_grad"]); From c9440ba263dac9731975d956c5dabdf70a384809 Mon Sep 17 00:00:00 2001 From: Mengxiao Lin Date: Fri, 4 Aug 2017 04:32:41 +0800 Subject: [PATCH 11/26] update RCNN example for BaseModule::init_params (#6813) * update RCNN example for BaseModule::init_params * Update module.py From d65d363be82fcf8f29994b6506a7047af29b488d Mon Sep 17 00:00:00 2001 From: joey2014 Date: Thu, 3 Aug 2017 15:33:40 -0500 Subject: [PATCH 12/26] [caffe] support convert mtcnn and MobileNet model (#6956) * support convert mtcnn and MobileNet model * pass python lint * put "import re" before "import caffe_parser" as lint required * correct missed checkin and pass pylint --- tools/caffe_converter/convert_model.py | 7 ++++--- tools/caffe_converter/convert_symbol.py | 24 ++++++++++++++++++++++-- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/tools/caffe_converter/convert_model.py b/tools/caffe_converter/convert_model.py index 2d8c9941ddea..d1e4cd07c155 100644 --- a/tools/caffe_converter/convert_model.py +++ b/tools/caffe_converter/convert_model.py @@ -3,6 +3,7 @@ from __future__ import print_function import argparse import sys +import re import caffe_parser import mxnet as mx import numpy as np @@ -53,8 +54,8 @@ def convert_model(prototxt_fname, caffemodel_fname, output_prefix=None): or layer_type == 'Deconvolution' or layer_type == 39: if layer_type == 'PReLU': assert (len(layer_blobs) == 1) - wmat = layer_blobs[0].data weight_name = layer_name + '_gamma' + wmat = np.array(layer_blobs[0].data).reshape(arg_shape_dic[weight_name]) arg_params[weight_name] = mx.nd.zeros(wmat.shape) arg_params[weight_name][:] = wmat continue @@ -148,7 +149,7 @@ def convert_model(prototxt_fname, caffemodel_fname, output_prefix=None): aux_params[var_name] = mx.nd.zeros(var.shape) # Get the original epsilon for idx, layer in enumerate(layers_proto): - if layer.name == bn_name: + if layer.name == bn_name or re.sub('[-/]', '_', layer.name) == bn_name: bn_index = idx eps_caffe = layers_proto[bn_index].batch_norm_param.eps # Compensate for the epsilon shift performed in convert_symbol @@ -180,7 +181,7 @@ def convert_model(prototxt_fname, caffemodel_fname, output_prefix=None): assert len(layer_blobs) == 0 if output_prefix is not None: - model = mx.mod.Module(symbol=sym, label_names=['prob_label', ]) + model = mx.mod.Module(symbol=sym, label_names=[arg_names[-1], ]) model.bind(data_shapes=[('data', tuple(input_dim))]) model.init_params(arg_params=arg_params, aux_params=aux_params) model.save_checkpoint(output_prefix, 0) diff --git a/tools/caffe_converter/convert_symbol.py b/tools/caffe_converter/convert_symbol.py index c384c7690088..100a64fe63c6 100644 --- a/tools/caffe_converter/convert_symbol.py +++ b/tools/caffe_converter/convert_symbol.py @@ -120,6 +120,7 @@ def _parse_proto(prototxt_fname): flatten_count = 0 output_name = "" prev_name = None + _output_name = {} # convert reset layers one by one for i, layer in enumerate(layers): @@ -252,6 +253,22 @@ def _parse_proto(prototxt_fname): for j in range(len(layer.top)): mapping[layer.top[j]] = name output_name = name + for k in range(len(layer.bottom)): + if layer.bottom[k] in _output_name: + _output_name[layer.bottom[k]]['count'] = 
_output_name[layer.bottom[k]]['count']+1 + else: + _output_name[layer.bottom[k]] = {'count':0} + for k in range(len(layer.top)): + if layer.top[k] in _output_name: + _output_name[layer.top[k]]['count'] = _output_name[layer.top[k]]['count']+1 + else: + _output_name[layer.top[k]] = {'count':0, 'name':name} + + output_name = [] + for i in _output_name: + if 'name' in _output_name[i] and _output_name[i]['count'] == 0: + output_name.append(_output_name[i]['name']) + return symbol_string, output_name, input_dim def convert_symbol(prototxt_fname): @@ -272,8 +289,11 @@ def convert_symbol(prototxt_fname): sym, output_name, input_dim = _parse_proto(prototxt_fname) exec(sym) # pylint: disable=exec-used _locals = locals() - exec("ret = " + output_name, globals(), _locals) # pylint: disable=exec-used - ret = _locals['ret'] + ret = [] + for i in output_name: + exec("ret = " + i, globals(), _locals) # pylint: disable=exec-used + ret.append(_locals['ret']) + ret = mx.sym.Group(ret) return ret, input_dim def main(): From be7e7916f444f67eda5d8acdc8b92d0bf339cb53 Mon Sep 17 00:00:00 2001 From: gurumurthys Date: Thu, 3 Aug 2017 13:34:45 -0700 Subject: [PATCH 13/26] Fixed visualization code error for bi-directional lstms (#6674) --- python/mxnet/visualization.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/mxnet/visualization.py b/python/mxnet/visualization.py index 97b6bfa25b1b..e67fee427be2 100644 --- a/python/mxnet/visualization.py +++ b/python/mxnet/visualization.py @@ -317,7 +317,6 @@ def looks_like_weight(name): params = input_node["attr"] if "num_outputs" in params: key += str(int(params["num_outputs"]) - 1) - params["num_outputs"] = int(params["num_outputs"]) - 1 shape = shape_dict[key][1:] label = "x".join([str(x) for x in shape]) attr["label"] = label From b848c241be41b6933923e9acc7caa40d6c2f76b4 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Fri, 4 Aug 2017 05:35:18 +0900 Subject: [PATCH 14/26] fix example/rnn: Speedometer(..., auto_reset=False) (#6679) If the Speedometer resets the eval_metric and due to an unlucky number of batches the end_of_batch is reached immediately after, the Perplexity will throw an ZeroDivisionError as eval_metric.num_inst == 0. 
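
For illustration, a minimal sketch of the safer callback setup this patch applies
(assuming the standard mx.callback.Speedometer signature shown in the diff below;
the batch size and logging interval here are placeholders, not values from the patch):

    import mxnet as mx

    batch_size = 32      # illustrative value
    disp_batches = 50    # illustrative logging interval

    # auto_reset=False keeps the evaluation metric accumulating between logging
    # intervals, so eval_metric.num_inst can no longer drop back to zero right
    # before end_of_batch, and Perplexity avoids the ZeroDivisionError above.
    speedometer = mx.callback.Speedometer(batch_size, disp_batches,
                                          auto_reset=False)
    # then pass it to fit(), e.g. batch_end_callback=speedometer
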
--- example/rnn/cudnn_lstm_bucketing.py | 4 ++-- example/rnn/lstm_bucketing.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/example/rnn/cudnn_lstm_bucketing.py b/example/rnn/cudnn_lstm_bucketing.py index 140f2e697015..fbf32bbacb42 100644 --- a/example/rnn/cudnn_lstm_bucketing.py +++ b/example/rnn/cudnn_lstm_bucketing.py @@ -135,13 +135,13 @@ def sym_gen(seq_len): eval_metric = mx.metric.Perplexity(invalid_label), kvstore = args.kv_store, optimizer = args.optimizer, - optimizer_params = opt_params, + optimizer_params = opt_params, initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), arg_params = arg_params, aux_params = aux_params, begin_epoch = args.load_epoch, num_epoch = args.num_epochs, - batch_end_callback = mx.callback.Speedometer(args.batch_size, args.disp_batches), + batch_end_callback = mx.callback.Speedometer(args.batch_size, args.disp_batches, auto_reset=False), epoch_end_callback = mx.rnn.do_rnn_checkpoint(cell, args.model_prefix, 1) if args.model_prefix else None) diff --git a/example/rnn/lstm_bucketing.py b/example/rnn/lstm_bucketing.py index 6c4371b2fd4e..609276a11f19 100644 --- a/example/rnn/lstm_bucketing.py +++ b/example/rnn/lstm_bucketing.py @@ -107,4 +107,4 @@ def sym_gen(seq_len): 'wd': args.wd }, initializer = mx.init.Xavier(factor_type="in", magnitude=2.34), num_epoch = args.num_epochs, - batch_end_callback = mx.callback.Speedometer(args.batch_size, args.disp_batches)) + batch_end_callback = mx.callback.Speedometer(args.batch_size, args.disp_batches, auto_reset=False)) From edbb4ff26321773789ce3237211587697dbaced0 Mon Sep 17 00:00:00 2001 From: lxn2 Date: Thu, 3 Aug 2017 13:44:54 -0700 Subject: [PATCH 15/26] Test installation of pip --pre (#7314) * Enable test for pip-mkl * Add --pre flag to test pre-releases * Fix conflicts --- tests/jenkins/run_test_pip_installations.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/jenkins/run_test_pip_installations.sh b/tests/jenkins/run_test_pip_installations.sh index 9246708f4329..de235a0a6359 100755 --- a/tests/jenkins/run_test_pip_installations.sh +++ b/tests/jenkins/run_test_pip_installations.sh @@ -42,13 +42,12 @@ for DEV in "${DEVICES[@]}"; do echo "Testing ${PYTHON}" DOCKER_CMD="virtualenv -p \"/usr/bin/${PYTHON}\" ${PYTHON}; source \"${PYTHON}/bin/activate\"; cd ${WORKSPACE};" if [[ "${DEV}" == *"cpu"* ]]; then - DOCKER_CMD="${DOCKER_CMD} pip install mxnet; python tests/python/train/test_conv.py" + DOCKER_CMD="${DOCKER_CMD} pip install mxnet --pre; python tests/python/train/test_conv.py" elif [[ "${DEV}" == *"cu75"* ]]; then - DOCKER_CMD="${DOCKER_CMD} pip install mxnet-cu75; python tests/python/train/test_conv.py --gpu" + DOCKER_CMD="${DOCKER_CMD} pip install mxnet-cu75 --pre; python tests/python/train/test_conv.py --gpu" elif [[ "${DEV}" == *"cu80"* ]]; then - DOCKER_CMD="${DOCKER_CMD} pip install mxnet-cu80; python tests/python/train/test_conv.py --gpu" + DOCKER_CMD="${DOCKER_CMD} pip install mxnet-cu80 --pre; python tests/python/train/test_conv.py --gpu" fi - ${DOCKER_BINARY} run --rm -v ${WORKSPACE}:${WORKSPACE} -w ${WORKSPACE} ${DOCKER_TAG} bash -c "tests/jenkins/run_as_user.sh `id -u` `id -un` `id -g` `id -un` '${DOCKER_CMD}'" done From dd4512f82051711240adc301033e52bec7998abf Mon Sep 17 00:00:00 2001 From: "Qiang Kou (KK)" Date: Thu, 3 Aug 2017 22:22:09 +0000 Subject: [PATCH 16/26] [R] update docs from mx.symbol.MakeLoss. 
close #2922 (#7325) --- R-package/vignettes/CustomLossFunction.Rmd | 159 +++++++++++++++ docs/tutorials/r/CustomLossFunction.md | 220 +++++++++++++++++---- 2 files changed, 341 insertions(+), 38 deletions(-) create mode 100644 R-package/vignettes/CustomLossFunction.Rmd diff --git a/R-package/vignettes/CustomLossFunction.Rmd b/R-package/vignettes/CustomLossFunction.Rmd new file mode 100644 index 000000000000..1817109e1387 --- /dev/null +++ b/R-package/vignettes/CustomLossFunction.Rmd @@ -0,0 +1,159 @@ +--- +title: "Customized loss function" +output: + md_document: + variant: markdown_github +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +This tutorial provides guidelines for using customized loss function in network construction. + +Model Training Example +---------- + +Let's begin with a small regression example. We can build and train a regression model with the following code: + +```{r} +data(BostonHousing, package = "mlbench") +BostonHousing[, sapply(BostonHousing, is.factor)] <- + as.numeric(as.character(BostonHousing[, sapply(BostonHousing, is.factor)])) +BostonHousing <- data.frame(scale(BostonHousing)) + +test.ind = seq(1, 506, 5) # 1 pt in 5 used for testing +train.x = data.matrix(BostonHousing[-test.ind,-14]) +train.y = BostonHousing[-test.ind, 14] +test.x = data.matrix(BostonHousing[--test.ind,-14]) +test.y = BostonHousing[--test.ind, 14] + +require(mxnet) + +data <- mx.symbol.Variable("data") +label <- mx.symbol.Variable("label") +fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1") +tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1") +fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2") +lro <- mx.symbol.LinearRegressionOutput(fc2, name = "lro") + +mx.set.seed(0) +model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y, + ctx = mx.cpu(), + num.round = 5, + array.batch.size = 60, + optimizer = "rmsprop", + verbose = TRUE, + array.layout = "rowmajor", + batch.end.callback = NULL, + epoch.end.callback = NULL) + +pred <- predict(model, test.x) +sum((test.y - pred[1,])^2) / length(test.y) +``` + +Besides the `LinearRegressionOutput`, we also provide `LogisticRegressionOutput` and `MAERegressionOutput`. +However, this might not be enough for real-world models. You can provide your own loss function +by using `mx.symbol.MakeLoss` when constructing the network. + +How to Use Your Own Loss Function +--------- + +We still use our previous example, but this time we use `mx.symbol.MakeLoss` to minimize the `(pred-label)^2` + +```{r} +data <- mx.symbol.Variable("data") +label <- mx.symbol.Variable("label") +fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1") +tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1") +fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2") +lro2 <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fc2, shape = 0) - label), name="lro2") +``` + +Then we can train the network just as usual. + +```{r} +mx.set.seed(0) +model2 <- mx.model.FeedForward.create(lro2, X = train.x, y = train.y, + ctx = mx.cpu(), + num.round = 5, + array.batch.size = 60, + optimizer = "rmsprop", + verbose = TRUE, + array.layout = "rowmajor", + batch.end.callback = NULL, + epoch.end.callback = NULL) +``` + +We should get very similar results because we are actually minimizing the same loss function. +However, the result is quite different. 
+ +```{r} +pred2 <- predict(model2, test.x) +sum((test.y - pred2)^2) / length(test.y) +``` + +This is because output of `mx.symbol.MakeLoss` is the gradient of loss with respect to the input data. +We can get the real prediction as below. + +```{r} +internals = internals(model2$symbol) +fc_symbol = internals[[match("fc2_output", outputs(internals))]] + +model3 <- list(symbol = fc_symbol, + arg.params = model2$arg.params, + aux.params = model2$aux.params) + +class(model3) <- "MXFeedForwardModel" + +pred3 <- predict(model3, test.x) +sum((test.y - pred3[1,])^2) / length(test.y) +``` + +We have provided many operations on the symbols. An example of `|pred-label|` can be found below. + +```{r} +lro_abs <- mx.symbol.MakeLoss(mx.symbol.abs(mx.symbol.Reshape(fc2, shape = 0) - label)) +mx.set.seed(0) +model4 <- mx.model.FeedForward.create(lro_abs, X = train.x, y = train.y, + ctx = mx.cpu(), + num.round = 20, + array.batch.size = 60, + optimizer = "sgd", + learning.rate = 0.001, + verbose = TRUE, + array.layout = "rowmajor", + batch.end.callback = NULL, + epoch.end.callback = NULL) + +internals = internals(model4$symbol) +fc_symbol = internals[[match("fc2_output", outputs(internals))]] + +model5 <- list(symbol = fc_symbol, + arg.params = model4$arg.params, + aux.params = model4$aux.params) + +class(model5) <- "MXFeedForwardModel" + +pred5 <- predict(model5, test.x) +sum(abs(test.y - pred5[1,])) / length(test.y) +``` + + +```{r} +lro_mae <- mx.symbol.MAERegressionOutput(fc2, name = "lro") +mx.set.seed(0) +model6 <- mx.model.FeedForward.create(lro_mae, X = train.x, y = train.y, + ctx = mx.cpu(), + num.round = 20, + array.batch.size = 60, + optimizer = "sgd", + learning.rate = 0.001, + verbose = TRUE, + array.layout = "rowmajor", + batch.end.callback = NULL, + epoch.end.callback = NULL) +pred6 <- predict(model6, test.x) +sum(abs(test.y - pred6[1,])) / length(test.y) +``` + diff --git a/docs/tutorials/r/CustomLossFunction.md b/docs/tutorials/r/CustomLossFunction.md index a7104803cacb..afb99518894c 100644 --- a/docs/tutorials/r/CustomLossFunction.md +++ b/docs/tutorials/r/CustomLossFunction.md @@ -3,57 +3,201 @@ Customized loss function This tutorial provides guidelines for using customized loss function in network construction. - Model Training Example ----------- +---------------------- Let's begin with a small regression example. 
We can build and train a regression model with the following code: +``` r +data(BostonHousing, package = "mlbench") +BostonHousing[, sapply(BostonHousing, is.factor)] <- + as.numeric(as.character(BostonHousing[, sapply(BostonHousing, is.factor)])) +BostonHousing <- data.frame(scale(BostonHousing)) + +test.ind = seq(1, 506, 5) # 1 pt in 5 used for testing +train.x = data.matrix(BostonHousing[-test.ind,-14]) +train.y = BostonHousing[-test.ind, 14] +test.x = data.matrix(BostonHousing[--test.ind,-14]) +test.y = BostonHousing[--test.ind, 14] + +require(mxnet) +``` + + ## Loading required package: mxnet + +``` r +data <- mx.symbol.Variable("data") +label <- mx.symbol.Variable("label") +fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1") +tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1") +fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2") +lro <- mx.symbol.LinearRegressionOutput(fc2, name = "lro") + +mx.set.seed(0) +model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y, + ctx = mx.cpu(), + num.round = 5, + array.batch.size = 60, + optimizer = "rmsprop", + verbose = TRUE, + array.layout = "rowmajor", + batch.end.callback = NULL, + epoch.end.callback = NULL) +``` + + ## Start training with 1 devices + +``` r +pred <- predict(model, test.x) +``` + + ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor.. + +``` r +sum((test.y - pred[1,])^2) / length(test.y) +``` - ```r - library(mxnet) - data(BostonHousing, package="mlbench") - train.ind = seq(1, 506, 3) - train.x = data.matrix(BostonHousing[train.ind, -14]) - train.y = BostonHousing[train.ind, 14] - test.x = data.matrix(BostonHousing[-train.ind, -14]) - test.y = BostonHousing[-train.ind, 14] - data <- mx.symbol.Variable("data") - fc1 <- mx.symbol.FullyConnected(data, num_hidden=1) - lro <- mx.symbol.LinearRegressionOutput(fc1) - mx.set.seed(0) - model <- mx.model.FeedForward.create( - lro, X=train.x, y=train.y, - eval.data=list(data=test.x, label=test.y), - ctx=mx.cpu(), num.round=10, array.batch.size=20, - learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse) - ``` - -Besides the `LinearRegressionOutput`, we also provide `LogisticRegressionOutput` and `MAERegressionOutput`. -However, this might not be enough for real-world models. You can provide your own loss function -by using `mx.symbol.MakeLoss` when constructing the network. + ## [1] 0.2485236 +Besides the `LinearRegressionOutput`, we also provide `LogisticRegressionOutput` and `MAERegressionOutput`. However, this might not be enough for real-world models. You can provide your own loss function by using `mx.symbol.MakeLoss` when constructing the network. How to Use Your Own Loss Function ---------- +--------------------------------- + +We still use our previous example, but this time we use `mx.symbol.MakeLoss` to minimize the `(pred-label)^2` + +``` r +data <- mx.symbol.Variable("data") +label <- mx.symbol.Variable("label") +fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1") +tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1") +fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2") +lro2 <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fc2, shape = 0) - label), name="lro2") +``` + +Then we can train the network just as usual. 
+ +``` r +mx.set.seed(0) +model2 <- mx.model.FeedForward.create(lro2, X = train.x, y = train.y, + ctx = mx.cpu(), + num.round = 5, + array.batch.size = 60, + optimizer = "rmsprop", + verbose = TRUE, + array.layout = "rowmajor", + batch.end.callback = NULL, + epoch.end.callback = NULL) +``` + + ## Start training with 1 devices + +We should get very similar results because we are actually minimizing the same loss function. However, the result is quite different. + +``` r +pred2 <- predict(model2, test.x) +``` + + ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor.. + +``` r +sum((test.y - pred2)^2) / length(test.y) +``` + + ## [1] 1.234584 + +This is because output of `mx.symbol.MakeLoss` is the gradient of loss with respect to the input data. We can get the real prediction as below. + +``` r +internals = internals(model2$symbol) +fc_symbol = internals[[match("fc2_output", outputs(internals))]] + +model3 <- list(symbol = fc_symbol, + arg.params = model2$arg.params, + aux.params = model2$aux.params) + +class(model3) <- "MXFeedForwardModel" + +pred3 <- predict(model3, test.x) +``` + + ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor.. + +``` r +sum((test.y - pred3[1,])^2) / length(test.y) +``` + + ## [1] 0.248294 + +We have provided many operations on the symbols. An example of `|pred-label|` can be found below. + +``` r +lro_abs <- mx.symbol.MakeLoss(mx.symbol.abs(mx.symbol.Reshape(fc2, shape = 0) - label)) +mx.set.seed(0) +model4 <- mx.model.FeedForward.create(lro_abs, X = train.x, y = train.y, + ctx = mx.cpu(), + num.round = 20, + array.batch.size = 60, + optimizer = "sgd", + learning.rate = 0.001, + verbose = TRUE, + array.layout = "rowmajor", + batch.end.callback = NULL, + epoch.end.callback = NULL) +``` + + ## Start training with 1 devices + +``` r +internals = internals(model4$symbol) +fc_symbol = internals[[match("fc2_output", outputs(internals))]] + +model5 <- list(symbol = fc_symbol, + arg.params = model4$arg.params, + aux.params = model4$aux.params) + +class(model5) <- "MXFeedForwardModel" + +pred5 <- predict(model5, test.x) +``` + + ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor.. + +``` r +sum(abs(test.y - pred5[1,])) / length(test.y) +``` + + ## [1] 0.7056902 + +``` r +lro_mae <- mx.symbol.MAERegressionOutput(fc2, name = "lro") +mx.set.seed(0) +model6 <- mx.model.FeedForward.create(lro_mae, X = train.x, y = train.y, + ctx = mx.cpu(), + num.round = 20, + array.batch.size = 60, + optimizer = "sgd", + learning.rate = 0.001, + verbose = TRUE, + array.layout = "rowmajor", + batch.end.callback = NULL, + epoch.end.callback = NULL) +``` + + ## Start training with 1 devices -We still use our previous example. +``` r +pred6 <- predict(model6, test.x) +``` - ```r - library(mxnet) - data <- mx.symbol.Variable("data") - fc1 <- mx.symbol.FullyConnected(data, num_hidden=1) - lro <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fc1, shape = 0) - label)) - ``` + ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor.. -In the last line of network definition, we do not use the predefined loss function. We define the loss -by ourselves, which is `(pred-label)^2`. +``` r +sum(abs(test.y - pred6[1,])) / length(test.y) +``` -We have provided many operations on the symbols, so you can also define `|pred-label|` using the line below. 
+ ## [1] 0.7056902 - ```r - lro <- mx.symbol.MakeLoss(mx.symbol.abs(mx.symbol.Reshape(fc1, shape = 0) - label)) - ``` ## Next Steps * [Neural Networks with MXNet in Five Minutes](http://mxnet.io/tutorials/r/fiveMinutesNeuralNetwork.html) From feebe774053c3b8361bf6d48dba998186261fe56 Mon Sep 17 00:00:00 2001 From: smarthi Date: Fri, 28 Jul 2017 15:45:56 -0400 Subject: [PATCH 17/26] Add KEYS file --- KEYS | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 KEYS diff --git a/KEYS b/KEYS new file mode 100644 index 000000000000..28c497fceb68 --- /dev/null +++ b/KEYS @@ -0,0 +1,73 @@ +This file contains the PGP keys of various developers. +Please don't use them for email unless you have to. Their main +purpose is code signing. + +Examples of importing this file in your keystore: + gpg --import KEYS.txt + (need pgp and other examples here) + +Examples of adding your key to this file: + pgp -kxa and append it to this file. + (pgpk -ll && pgpk -xa ) >> this file. + (gpg --list-sigs + && gpg --armor --export ) >> this file. + +----------------------------------------------------------------------------------- +pub 4096R/D3541808 2014-01-09 +uid [ultimate] Suneel Marthi (CODE SIGNING KEY) +sig 3 D3541808 2014-01-09 Suneel Marthi (CODE SIGNING KEY) +sub 4096R/AF46E2DE 2014-01-09 +sig D3541808 2014-01-09 Suneel Marthi (CODE SIGNING KEY) + +-----BEGIN PGP PUBLIC KEY BLOCK----- +Comment: GPGTools - https://gpgtools.org + +mQINBFLPJmEBEAC9d/dUZCXeyhB0fVGmJAjdjXfLebav4VqGdNZC+M1T9C3dcVsh +X/JGme5bjJeIgVwiH5UsdNceYn1+hyxs8jXuRAWEWKP76gD+pNrp8Az0ZdBkJoAy +zCywOPtJV2PCOz7+S5ri2nUA2+1Kgcu6IlSLMmYAGO0IAmRrjBEzxy9iGaxiNGTc +LvQt/iVtIXWkKKI8yvpoJ8iFf3TGhpjgaC/h7cJP3zpy0SScmhJJASLXRsfocLv9 +sle6ndN9IPbDtRW8cL7Fk3VQlzp1ToVjmnQTyZZ6S1WafsjzCZ9hLN+k++o8VbvY +v3icY6Sy0BKz0J6KwaxTkuZ6w1K7oUkVOQboKaWFIEdO+jwrEmU+Puyd8Np8jLnF +Q0Y5GPfyMlqM3S/zaDm1t4D1eb5FLciStkxfg5wPVK6TkqB325KVD3aio5C7E7kt +aQechHxaJXCQOtCtVY4X+L4iClnMSuk+hcSc8W8MYRTSVansItK0vI9eQZXMnpan +w9/jk5rS4Gts1rHB7+kdjT3QRJmkyk6fEFT0fz5tfMC7N8waeEUhCaRW6lAoiqDW +NW1h+0UGxJw+9YcGxBC0kkt3iofNOWQWmuf/BS3DHPKT7XV/YtBHe44wW0sF5L5P +nfQUHpnA3pcZ0En6bXAvepKVZTNdOWWJqMyHV+436DA+33h45QL6lWb/GwARAQAB +tDVTdW5lZWwgTWFydGhpIChDT0RFIFNJR05JTkcgS0VZKSA8c21hcnRoaUBhcGFj +aGUub3JnPokCNwQTAQoAIQUCUs8mYQIbAwULCQgHAwUVCgkICwUWAgMBAAIeAQIX +gAAKCRC08czE01QYCOKKEAChRtHBoYNTX+RZbFO0Kl1GlN+i1Ik0shEm5ZJ56XHv +AnFx/gRK7CfZzJswWo7kf2s/dvJiFfs+rrolYVuO6E8gNhAaTEomSuvWQAMHdPcR +9G5APRKCSkbZYugElqplEbSphk78FKoFO+sml52M7Pr9jj88ApBjoFVVY8njdnNq +6DVlaDsg8YninCD78Z7PNFnRGwxyZ8Qd4Dh0rG+MUTfAWopZu6/MxpQxU7QpeVeX +SIMLg7ClFrGfXnZcszYF4dnav1aa0i7W88PAdYNPko7tC5qz5yv2ep7t2gRbcYKf +RXhYC2FHQey3wPhMKjA8V436lAqmfYnY/YdmhEy9Xq/1EdX1nHsQ7OEkfgXK14WM +F+rnqXRAl/0cwiyb41eocdg5kpZFIKgCYT02usLWxwNnd3jOCe109Ze3y3acN/G8 ++xOf9YRfNVAe6pD8H6ieRbv9gRjBmsbz9bXQCmxFnDqxNri5Me6gBAQPNmYTJD0h +jgJTK6o0vJ0pwjBLauasJsLu+1tR3Cb0dxPE+JVaTF26FCd7pM7W6KdVfod9ZfrN +cSyJ/cECc2KvYVGmTjQNVo1dYG0awBachlWnYNt+0Qx4opLsczZOLtPKtFY4BJA7 +aZoXT4Qf9yB8km7x2/cgNExVbFummToJ/IP3M39/EaryspsQQuM5Qu5Q5lZp8Qnn +ybkCDQRSzyZhARAA7bAawFzbJaghYnm6mTZyGG5hQmfAynbF6cPAE+g2SnXcNQjP +6kjYx3tSpb7rEzmjQqs46ztqdec6PIVBMhakON6z27Zz+IviAtO/TcaZHWNuCAjw +FXVQZ+tYsSeiKInttfkrQc8jXAHWwSkSjLqNpvQpBdBEX80MYkFB6ZPOeON2+/Ta +GC1H/HU2YngF0qQSmG33KKG6ezihBJdKxU6t2tsQfTlCmZW6R6MGpS9fVurYMKBk +vR+7RGZ/H6dSjWPcpxhusGg92J9uz7r5SopN1wSdyPMUCMAFGeyoxcAuBDl38quU +H/ENG3x5LDPq2aEH2AJ6yvZfIXbeJ1zmXf2cAHv+HbmvZaTSp0XIjq8Yxh8NkYEC +ZdfRWmsGLIpU16TkBijpK3Dn9MDXjHGT3V8/qfdpURtMvIaL8WFrq9ejcy/vGRFn 
+mCYqxIIPH+vLiMXKWtuMc61GN3ES21msKQH6IuQxxfQLyhK44L/pv7FpF4E+6LaE +8uRwAex5HIDpR1v4aJq089rRtye9VXTJJLZ7lYs0HctdZ30QbBRWT4jS9d9rj3cr +HgQ7mIGO9TAfK2kWc6AJN/EvxPWNbOwptsTUzAF/adiy9ax8C18iw7nKczC+2eN6 +UcbxXiPdytuKYK7O9A8S9e1w89GwpxYN7Xfn2o6QfpSbL9cLKiinOeV+xikAEQEA +AYkCHwQYAQoACQUCUs8mYQIbDAAKCRC08czE01QYCG7yD/471dmyOD+go8cZkdqR +3CHhjH03odtI0EJNVy4VGEC0r9paz3BWYTy18LqWYkw3ygphOIU1r8/7QK3H5Ke3 +c4yCSUxaMk5SlAJ+iVRek5TABkR8+zI+ZN5pQtqRH+ya5JxV4F/Sx5Q3KWMzpvgY +n6AgSSc3hEfkgdI7SalIeyLaLDWv+RFdGZ5JU5gD28C0G8BeH8L62x6sixZcqoGT +oy9rwkjs45/ZmmvBZhd1wLvC/au8l2Ecou6O8+8m26W8Z7vCuGKxuWn0KV3DLLWe +66uchDVlakGoMJSPIK06JWYUlE+gL0CW+U2ekt/v2qb8hGgMVET3CBAMq+bFWuJ6 +juX7hJd7wHtCFfjnFDDAkdp2IIIZAlBW6FZGv7pJ82xsW6pSAg0A7VrV6nTtMtDv +T8esOfo/t4t0gaL7bivy9DVVdATbUBcJJFpoVoe5MxiyjptveqPzIRwzt04n52Ph +ordVWAnX5AokXWTg+Glem/EWEuf7jUuZArfqCSl/sZoQdXGTjR7G4iFscispji4+ +kNjVQsItqFbgDpuc6n+GcFxlKQ7YMCnu5MVtTV01U4lFs0qy0NTUqsuR35DM4z14 +DkFmj1upWAayCoXTpKzsHBvJZPC+Wqf9Pl3O47apelg7KxU3S011YfXpVPvCTKBv +kD2o/5GKWS5QkSUEUXXY1oDiLg== +=f8kJ +-----END PGP PUBLIC KEY BLOCK----- From 51ec4846409a3ebe09f24c17e2c10b4a255f480c Mon Sep 17 00:00:00 2001 From: qingzhouzhen <576591769@qq.com> Date: Fri, 4 Aug 2017 11:39:53 +0800 Subject: [PATCH 18/26] Mobilenet (#7330) * add mobilenet * modify a mistake in conv_4 --- example/image-classification/symbols/mobilenet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/image-classification/symbols/mobilenet.py b/example/image-classification/symbols/mobilenet.py index cf470bace727..8ad584a50768 100644 --- a/example/image-classification/symbols/mobilenet.py +++ b/example/image-classification/symbols/mobilenet.py @@ -14,7 +14,7 @@ def get_symbol(num_classes, **kwargs): conv_3_dw = Conv(conv_2, num_group=64, num_filter=64, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_3_dw") # 112/56 conv_3 = Conv(conv_3_dw, num_filter=128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_3") # 56/56 conv_4_dw = Conv(conv_3, num_group=128, num_filter=128, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_4_dw") # 56/56 - conv_4 = Conv(conv_3_dw, num_filter=128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_4") # 56/56 + conv_4 = Conv(conv_4_dw, num_filter=128, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_4") # 56/56 conv_5_dw = Conv(conv_4, num_group=128, num_filter=128, kernel=(3, 3), pad=(1, 1), stride=(2, 2), name="conv_5_dw") # 56/28 conv_5 = Conv(conv_5_dw, num_filter=256, kernel=(1, 1), pad=(0, 0), stride=(1, 1), name="conv_5") # 28/28 conv_6_dw = Conv(conv_5, num_group=256, num_filter=256, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name="conv_6_dw") # 28/28 From 14b83fccef7b96f8d38d780dbce3d0ef47267934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=81=E5=BE=B7=E6=BE=8E?= Date: Fri, 4 Aug 2017 23:46:51 +0800 Subject: [PATCH 19/26] [Scala] Make Module Api sync with Python interface (#7246) * [Scala] Make Module Api sync with Python interface * fix --- .../ml/dmlc/mxnet/module/BaseModule.scala | 38 +- .../module/DataParallelExecutorGroup.scala | 53 ++- .../scala/ml/dmlc/mxnet/module/Module.scala | 69 +++- .../dmlc/mxnet/module/SequentialModule.scala | 10 +- .../scala/ml/dmlc/mxnet/ModuleSuite.scala | 368 ++++++++++++++++++ .../scala/ml/dmlc/mxnet/OperatorSuite.scala | 2 +- 6 files changed, 514 insertions(+), 26 deletions(-) create mode 100644 scala-package/core/src/test/scala/ml/dmlc/mxnet/ModuleSuite.scala diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/BaseModule.scala 
b/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/BaseModule.scala index c1cb91de56f5..0a73e1afcde1 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/BaseModule.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/BaseModule.scala @@ -121,6 +121,7 @@ abstract class BaseModule { private[module] var auxParams: Map[String, NDArray] = null // High Level API + def getSymbol: Symbol = this.symbol // A convenient function that calls both `forward` and `backward`. def forwardBackward(dataBatch: DataBatch): Unit = { @@ -259,7 +260,7 @@ abstract class BaseModule { /** * Get parameters, those are potentially copies of the the actual parameters used * to do computation on the device. - * @return `(arg_params, aux_params)`, a pair of dictionary of name to value mapping. + * @return `(argParams, auxParams)`, a pair of dictionary of name to value mapping. */ def getParams: (Map[String, NDArray], Map[String, NDArray]) @@ -267,41 +268,52 @@ abstract class BaseModule { * Initialize the parameters and auxiliary states. * @param initializer : Initializer * Called to initialize parameters if needed. - * arg_params : dict + * argParams : dict * If not None, should be a dictionary of existing arg_params. Initialization * will be copied from that. - * aux_params : dict + * auxParams : dict * If not None, should be a dictionary of existing aux_params. Initialization * will be copied from that. - * allow_missing : bool + * allowMissing : bool * If true, params could contain missing values, and the initializer will be * called to fill those missing params. - * force_init : bool + * forceInit : bool * If true, will force re-initialize even if already initialized. + * allowExtra : bool + * Whether allow extra parameters that are not needed by symbol. + * If this is True, no error will be thrown when argParams or auxParams + * contain extra parameters that is not needed by the executor. */ def initParams(initializer: Initializer = new Uniform(0.01f), argParams: Map[String, NDArray] = null, auxParams: Map[String, NDArray] = null, - allowMissing: Boolean = false, forceInit: Boolean = false): Unit + allowMissing: Boolean = false, + forceInit: Boolean = false, + allowExtra: Boolean = false): Unit /** * Assign parameter and aux state values. - * arg_params : dict + * argParams : dict * Dictionary of name to value (`NDArray`) mapping. - * aux_params : dict + * auxParams : dict * Dictionary of name to value (`NDArray`) mapping. - * allow_missing : bool + * allowMissing : bool * If true, params could contain missing values, and the initializer will be * called to fill those missing params. - * force_init : bool + * forceInit : bool * If true, will force re-initialize even if already initialized. + * allowExtra : bool + * Whether allow extra parameters that are not needed by symbol. + * If this is True, no error will be thrown when argParams or auxParams + * contain extra parameters that is not needed by the executor. 
*/ def setParams(argParams: Map[String, NDArray], auxParams: Map[String, NDArray], allowMissing: Boolean = false, - forceInit: Boolean = true): Unit = { - initParams(initializer = null, argParams = argParams, auxParams = auxParams, - allowMissing = allowMissing, forceInit = forceInit) + forceInit: Boolean = true, + allowExtra: Boolean = false): Unit = { + initParams(initializer = null, argParams, auxParams, + allowMissing, forceInit, allowExtra) } /** diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/DataParallelExecutorGroup.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/DataParallelExecutorGroup.scala index 2e724c6dc9ce..ea78962d00e8 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/DataParallelExecutorGroup.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/DataParallelExecutorGroup.scala @@ -297,6 +297,7 @@ class DataParallelExecutorGroup private[module]( private var batchSize: Int = -1 private var slices: Array[(Int, Int)] = null + private var _defaultExecs: Array[Executor] = null private var execs: Array[Executor] = null private var dataArrays: Seq[Array[((Int, Int), NDArray)]] = null private var labelArrays: Option[Seq[Array[((Int, Int), NDArray)]]] = None @@ -305,8 +306,8 @@ class DataParallelExecutorGroup private[module]( private[module] var auxArrays: IndexedSeq[Array[NDArray]] = null private var inputGradArrays: IndexedSeq[Array[NDArray]] = null - private val dataLayouts = decideSlices(dataShapes) - private val labelLayouts = + private var dataLayouts = decideSlices(dataShapes) + private var labelLayouts = // call it to make sure labels has the same batch size as data if (labelShapes != None) decideSlices(labelShapes.get) else null @@ -349,12 +350,30 @@ class DataParallelExecutorGroup private[module]( * @param dataShapes DataDesc for input data. * @param labelShapes DataDesc for input labels. * @param sharedGroup + * @param reshape */ def bindExec(dataShapes: Seq[DataDesc], labelShapes: Option[Seq[DataDesc]], - sharedGroup: Option[DataParallelExecutorGroup]): Unit = { - execs = (0 until contexts.length).map(i => - bindIthExec(i, dataShapes, labelShapes, sharedGroup) - ).toArray + sharedGroup: Option[DataParallelExecutorGroup], reshape: Boolean = false): Unit = { + this.batchSize = -1 + dataLayouts = decideSlices(dataShapes) + labelLayouts = { + // call it to make sure labels has the same batch size as data + if (labelShapes != None) decideSlices(labelShapes.get) + else null + } + if (reshape) { + (0 until contexts.length).foreach { i => + val dataShapesSliced = slicedShape(dataShapes, i, dataLayouts) + val labelShapesSliced = labelShapes.map(slicedShape(_, i, labelLayouts)) + val inputShapes + = dataShapesSliced.toMap ++ labelShapesSliced.getOrElse(Map.empty[String, Shape]) + execs(i) = _defaultExecs(i).reshape(allowUpSizing = true, kwargs = inputShapes) + } + } else { + execs = (0 until contexts.length).map(i => + bindIthExec(i, dataShapes, labelShapes, sharedGroup) + ).toArray + } // convenient data structures dataArrays = dataShapes.map(dataDesc => @@ -399,13 +418,31 @@ class DataParallelExecutorGroup private[module]( auxArrays = (0 until auxNames.length).map(i => execs.map(_.auxArrays(i))) } + /** + * Reshape executors. 
+ * @param dataShapes + * @param labelShapes + */ + def reshape(dataShapes: Seq[DataDesc], labelShapes: Option[Seq[DataDesc]]): Unit = { + if (!(dataShapes == this.dataShapes && labelShapes == this.labelShapes)) { + if (this._defaultExecs == null) { + this._defaultExecs = this.execs.map(x => x) + } + this.bindExec(dataShapes, labelShapes, None, reshape = true) + } + } + /** * Assign, i.e. copy parameters to all the executors. * @param argParams A dictionary of name to `NDArray` parameter mapping. * @param auxParams A dictionary of name to `NDArray` auxiliary variable mapping. + * @param allowExtra hether allow extra parameters that are not needed by symbol. + * If this is True, no error will be thrown when argParams or auxParams + * contain extra parameters that is not needed by the executor. */ - def setParams(argParams: Map[String, NDArray], auxParams: Map[String, NDArray]): Unit = { - execs.foreach(_.copyParamsFrom(argParams, auxParams)) + def setParams(argParams: Map[String, NDArray], auxParams: Map[String, NDArray], + allowExtra: Boolean = false): Unit = { + execs.foreach(_.copyParamsFrom(argParams, auxParams, allowExtraParams = allowExtra)) } /** diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/Module.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/Module.scala index 2b1d743ea648..b9cc07826504 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/Module.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/Module.scala @@ -107,11 +107,16 @@ class Module(symbolVar: Symbol, * @param allowMissing If true, params could contain missing values, * and the initializer will be called to fill those missing params. * @param forceInit If true, will force re-initialize even if already initialized. + * @param allowExtra Whether allow extra parameters that are not needed by symbol. + * If this is True, no error will be thrown when argParams or auxParams + * contain extra parameters that is not needed by the executor. */ override def initParams(initializer: Initializer = new Uniform(0.01f), argParams: Map[String, NDArray] = null, auxParams: Map[String, NDArray] = null, - allowMissing: Boolean = false, forceInit: Boolean = false): Unit = { + allowMissing: Boolean = false, + forceInit: Boolean = false, + allowExtra: Boolean = false): Unit = { if (paramsInitialized && !forceInit) { return } @@ -141,7 +146,7 @@ class Module(symbolVar: Symbol, this.paramsDirty = false // copy the initialized parameters to devices - this.execGroup.setParams(this.argParams, this.auxParams) + this.execGroup.setParams(this.argParams, this.auxParams, allowExtra = allowExtra) } // Internal helper for parameter initialization @@ -261,6 +266,46 @@ class Module(symbolVar: Symbol, } } + /** + * Check that input names matches input data descriptors. + */ + @throws(classOf[IllegalArgumentException]) + private def _checkNamesMatch(dataNames: IndexedSeq[String], dataShapes: IndexedSeq[DataDesc], + name: String, throwEx: Boolean): Unit = { + val actual = dataShapes.map(_.name) + if (dataNames.sorted != actual.sorted) { + val msg = s"Data provided by ${name}_shapes don't match names specified by " + + s"${name}_names (${dataShapes.mkString(", ")} vs. 
${dataNames.mkString(", ")})" + if (throwEx) throw new IllegalArgumentException(msg) + else logger.warn(msg) + } + } + + /** + * parse data_attrs into DataDesc format and check that names match + */ + @throws(classOf[IllegalArgumentException]) + private def _parseDataDesc(dataNames: IndexedSeq[String], labelNames: IndexedSeq[String], + dataShapes: IndexedSeq[DataDesc], labelShapes: Option[IndexedSeq[DataDesc]]): + (IndexedSeq[DataDesc], Option[IndexedSeq[DataDesc]]) = { + _checkNamesMatch(dataNames, dataShapes, "data", true) + if (labelShapes != None) _checkNamesMatch(labelNames, labelShapes.get, "label", false) + (dataShapes, labelShapes) + } + + /** + * Reshapes the module for new input shapes. + * @param dataShapes Typically is `dataIter.provideData`. + * @param labelShapes Typically is `dataIter.provideLabel`. + */ + def reshape(dataShapes: IndexedSeq[DataDesc], + labelShapes: Option[IndexedSeq[DataDesc]] = None): Unit = { + require(this.binded) + val (tdataShapes, tlabelShapes) = this._parseDataDesc( + this.dataNames, this.labelNames, dataShapes, labelShapes) + this.execGroup.reshape(tdataShapes, tlabelShapes) + } + /** * Install and initialize optimizers. * @param kvstore @@ -344,6 +389,26 @@ class Module(symbolVar: Symbol, */ def forward(dataBatch: DataBatch, isTrain: Option[Boolean] = None): Unit = { require(binded && paramsInitialized) + val currDataShapes = this.dataShapes.map(_.shape) + val newDataShapes = dataBatch.data.map(_.shape) + if (currDataShapes != newDataShapes) { + val newDShapes: IndexedSeq[DataDesc] = + if (dataBatch.provideData != null) dataBatch.provideData + else { + this.dataShapes.zip(newDataShapes).map { case (i, shape) => + DataDesc(i.name, shape, i.dtype, i.layout) + } + } + val newLShapes: Option[IndexedSeq[DataDesc]] = + if (dataBatch.provideLabel != null) Some(dataBatch.provideLabel) + else if (dataBatch.label != null && dataBatch.label.length > 0 + && this.labelShapes != null) { + Some(this.labelShapes.zip(dataBatch.label).map { case (i, j) => + DataDesc(i.name, j.shape, i.dtype, i.layout) + }) + } else None + this.reshape(newDShapes, newLShapes) + } execGroup.forward(dataBatch, isTrain) } diff --git a/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/SequentialModule.scala b/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/SequentialModule.scala index dfa63ebac629..a77041de5b0a 100644 --- a/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/SequentialModule.scala +++ b/scala-package/core/src/main/scala/ml/dmlc/mxnet/module/SequentialModule.scala @@ -144,11 +144,16 @@ class SequentialModule extends BaseModule { * @param allowMissing If true, params could contain missing values, * and the initializer will be called to fill those missing params. * @param forceInit If true, will force re-initialize even if already initialized. + * @param allowExtra Whether allow extra parameters that are not needed by symbol. + * If this is True, no error will be thrown when argParams or auxParams + * contain extra parameters that is not needed by the executor. 
*/ override def initParams(initializer: Initializer = new Uniform(0.01f), argParams: Map[String, NDArray] = null, auxParams: Map[String, NDArray] = null, - allowMissing: Boolean = false, forceInit: Boolean = false): Unit = { + allowMissing: Boolean = false, + forceInit: Boolean = false, + allowExtra: Boolean = false): Unit = { if (this.paramsInitialized && !forceInit) { return } @@ -156,7 +161,8 @@ class SequentialModule extends BaseModule { for (module <- this.modules) { module.initParams(initializer = initializer, argParams = argParams, - auxParams = auxParams, allowMissing = allowMissing, forceInit = forceInit) + auxParams = auxParams, allowMissing = allowMissing, + forceInit = forceInit, allowExtra = allowExtra) } // Internal function to help checking duplicated names, diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/ModuleSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/ModuleSuite.scala new file mode 100644 index 000000000000..ab48ef7d1928 --- /dev/null +++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/ModuleSuite.scala @@ -0,0 +1,368 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ml.dmlc.mxnet + +import org.scalatest.{BeforeAndAfterAll, FunSuite} +import ml.dmlc.mxnet.CheckUtils._ +import ml.dmlc.mxnet.module._ +import ml.dmlc.mxnet.optimizer._ +import ml.dmlc.mxnet.io._ + +class ModuleSuite extends FunSuite with BeforeAndAfterAll { + test ("model dtype") { + val dType = DType.Float16 + val dShape = Shape(3, 8, 7) + + var sym = Symbol.Variable("data") + sym = Symbol.Activation(attr = Map("__layout__" -> "TNC"))()( + Map("data" -> sym, "act_type" -> "relu")) + + val mod = new Module(sym, IndexedSeq("data"), null, + contexts = Array(Context.cpu(0), Context.cpu(1))) + mod.bind(dataShapes = IndexedSeq(DataDesc("data", dShape, dType, "TNC"))) + mod.initParams() + mod.forward(new DataBatch( + data = IndexedSeq(NDArray.ones(dShape, dtype = dType)), + label = null, index = null, pad = 0)) + mod.backward(Array(NDArray.ones(dShape, dtype = dType))) + + assert(mod.getOutputs.flatten.forall(_.dtype == dType)) + } + + test ("module input_grads") { + val a = Symbol.Variable("a", kwargs = Map("__layout__" -> "NC")) + val b = Symbol.Variable("b", kwargs = Map("__layout__" -> "NC")) + var c = Symbol.Variable("c", kwargs = Map("__layout__" -> "NC")) + + import SymbolConversions._ + c = a + 2 * b + 3 * c + + val mod = new Module(c, IndexedSeq("b", "c", "a"), null, + contexts = Array(Context.cpu(0), Context.cpu(1))) + mod.bind(dataShapes = IndexedSeq( + DataDesc("b", Shape(5, 5)), + DataDesc("c", Shape(5, 5)), + DataDesc("a", Shape(5, 5))), + inputsNeedGrad = true + ) + mod.initParams() + mod.forward(new DataBatch( + data = IndexedSeq( + NDArray.ones(5, 5), NDArray.ones(5, 5), NDArray.ones(5, 5)), + label = null, index = null, pad = 0)) + mod.backward(Array(NDArray.ones(5, 5))) + + val inputGrads = mod.getInputGradsMerged() + val aGrad = inputGrads(0).toArray + val bGrad = inputGrads(1).toArray + val cGrad = inputGrads(2).toArray + + assert(aGrad.forall(_ == 1f)) + assert(bGrad.forall(_ == 2f)) + assert(cGrad.forall(_ == 3f)) + } + + test ("module layout") { + var sym = Symbol.Variable("data") + sym = Symbol.Activation(attr = Map("__layout__" -> "TNC"))()( + Map("data" -> sym, "act_type" -> "relu")) + + val dShape = Shape(3, 8, 7) + val mod = new Module(sym, IndexedSeq("data"), null, + contexts = Array(Context.cpu(0), Context.cpu(1))) + mod.bind(dataShapes = IndexedSeq(DataDesc("data", dShape, layout = "TNC"))) + mod.initParams() + mod.forward(new DataBatch( + data = IndexedSeq(NDArray.ones(dShape)), + label = null, index = null, pad = 0)) + mod.backward(Array(NDArray.ones(dShape))) + assert(mod.getOutputsMerged()(0).shape == dShape) + + val hdShape = Shape(3, 4, 7) + for (x <- mod.getOutputs) assert(x(0).shape == hdShape) + } + + test ("save load") { + def mapEqu(a: Map[String, NDArray], b: Map[String, NDArray]): Unit = { + assert(a.toSet == b.toSet) + for (k <- a.keys) assert(a(k) == b(k)) + } + + var sym = Symbol.Variable("data") + sym = Symbol.FullyConnected()()(Map("data" -> sym, "num_hidden" -> 100)) + + // single device + var mod = new Module(sym, IndexedSeq("data"), null) + mod.bind(dataShapes = IndexedSeq(DataDesc("data", Shape(10, 10)))) + mod.initParams() + mod.initOptimizer(optimizer = new SGD(learningRate = 0.1f, momentum = 0.9f)) + mod.update() + mod.saveCheckpoint("test", 0, saveOptStates = true) + + var mod2 = Module.loadCheckpoint("test", 0, loadOptimizerStates = true) + mod2.bind(dataShapes = IndexedSeq(DataDesc("data", Shape(10, 10)))) + mod2.initOptimizer(optimizer = new SGD(learningRate = 0.1f, momentum = 0.9f)) + assert(mod.getSymbol.toJson == 
mod2.getSymbol.toJson) + mapEqu(mod.getParams._1, mod2.getParams._1) + + // multi device + mod = new Module(sym, IndexedSeq("data"), null, + contexts = Array(Context.cpu(0), Context.cpu(1))) + mod.bind(dataShapes = IndexedSeq(DataDesc("data", Shape(10, 10)))) + mod.initParams() + mod.initOptimizer(optimizer = new SGD(learningRate = 0.1f, momentum = 0.9f)) + mod.update() + mod.saveCheckpoint("test", 0, saveOptStates = true) + + mod2 = Module.loadCheckpoint("test", 0, loadOptimizerStates = true) + mod2.bind(dataShapes = IndexedSeq(DataDesc("data", Shape(10, 10)))) + mod2.initOptimizer(optimizer = new SGD(learningRate = 0.1f, momentum = 0.9f)) + assert(mod.getSymbol.toJson == mod2.getSymbol.toJson) + mapEqu(mod.getParams._1, mod2.getParams._1) + } + + test ("module reshape") { + var sym = Symbol.Variable("data") + sym = Symbol.FullyConnected("fc")()(Map("data" -> sym, "num_hidden" -> 20)) + + var dShape = Shape(7, 20) + val mod = new Module(sym, IndexedSeq("data"), null, + contexts = Array(Context.cpu(0), Context.cpu(1))) + mod.bind(dataShapes = IndexedSeq(DataDesc("data", dShape))) + mod.initParams() + mod.initOptimizer(optimizer = new SGD(learningRate = 1f)) + + mod.forward(new DataBatch( + data = IndexedSeq(NDArray.ones(dShape)), + label = null, index = null, pad = 0)) + mod.backward(Array(NDArray.ones(dShape))) + mod.update() + assert(mod.getOutputsMerged()(0).shape == dShape) + assert(mod.getParams._1("fc_bias").toArray.forall(_ == -1f)) + + dShape = Shape(14, 20) + mod.reshape(IndexedSeq(DataDesc("data", dShape))) + mod.forward(new DataBatch( + data = IndexedSeq(NDArray.ones(dShape)), + label = null, index = null, pad = 0)) + mod.backward(Array(NDArray.ones(dShape))) + mod.update() + assert(mod.getOutputsMerged()(0).shape == dShape) + assert(mod.getParams._1("fc_bias").toArray.forall(x => (x - -3f) < 1e-3)) + } + + test ("module setParams") { + val data = NDArray.array(Array(0.05f, 0.1f), Shape(1, 2)) + val label = NDArray.array(Array(0.01f, 0.99f), Shape(1, 2)) + val trainData = new NDArrayIter( + IndexedSeq(data), IndexedSeq(label), labelName = "softmax_label") + + // symbols + var x = Symbol.Variable("data") + x = Symbol.FullyConnected(name = "fc_0")()(Map("data" -> x, "num_hidden" -> 2)) + x = Symbol.Activation(name = "act_0")()(Map("data" -> x, "act_type" -> "sigmoid")) + x = Symbol.FullyConnected(name = "fc_1")()(Map("data" -> x, "num_hidden" -> 2)) + x = Symbol.Activation(name = "act_1")()(Map("data" -> x, "act_type" -> "sigmoid")) + x = Symbol.LinearRegressionOutput(name = "softmax")()(Map("data" -> x, "grad_scale" -> 2)) + + // create module + val mod = new Module(x, contexts = Array(Context.cpu())) + mod.bind(dataShapes = trainData.provideData, + Option(trainData.provideLabel)) + val argParamsCorrect = Map( + "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)), + "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)), + "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), Shape(2, 2)), + "fc_1_bias" -> NDArray.array(Array(0.6f, 0.6f), Shape(2)) + ) + val argParamsMissing = Map( + "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)), + "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)), + "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), Shape(2, 2)) + ) + val argParamsExtra = Map( + "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)), + "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)), + "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), 
Shape(2, 2)), + "fc_1_bias" -> NDArray.array(Array(0.6f, 0.6f), Shape(2)), + "fc_2_weight" -> NDArray.array(Array(0.6f, 0.6f), Shape(2)) + ) + + mod.setParams(forceInit = true, argParams = argParamsCorrect, + auxParams = null) + + // test allow missing + mod.setParams(forceInit = true, argParams = argParamsMissing, + auxParams = null, allowMissing = true) + + // test allow extra + mod.setParams(forceInit = true, argParams = argParamsExtra, auxParams = null, + allowMissing = true, allowExtra = true) + } + + test ("monitor") { + // data iter + val data = NDArray.array(Array(0.05f, 0.1f), Shape(1, 2)) + val label = NDArray.array(Array(0.01f, 0.99f), Shape(1, 2)) + val trainData = new NDArrayIter( + IndexedSeq(data), IndexedSeq(label), labelName = "softmax_label") + + // symbols + var x = Symbol.Variable("data") + x = Symbol.FullyConnected(name = "fc_0")()(Map("data" -> x, "num_hidden" -> 2)) + x = Symbol.Activation(name = "act_0")()(Map("data" -> x, "act_type" -> "sigmoid")) + x = Symbol.FullyConnected(name = "fc_1")()(Map("data" -> x, "num_hidden" -> 2)) + x = Symbol.Activation(name = "act_1")()(Map("data" -> x, "act_type" -> "sigmoid")) + x = Symbol.LinearRegressionOutput(name = "softmax")()(Map("data" -> x, "grad_scale" -> 2)) + + // create monitor + def meanAbs(x: NDArray): NDArray = { + val sumAbs = NDArray.sum(NDArray.abs(x)) + sumAbs / x.shape.product + } + val mon = new Monitor(1, statFunc = meanAbs) + + // create module + val mod = new Module(x, contexts = Array(Context.cpu())) + mod.bind(dataShapes = trainData.provideData, + Option(trainData.provideLabel)) + mod.installMonitor(mon) + val argParams = Map( + "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)), + "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)), + "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), Shape(2, 2)), + "fc_1_bias" -> NDArray.array(Array(0.6f, 0.6f), Shape(2)) + ) + mod.initParams(argParams = argParams) + + val dataBatch = trainData.next() + mon.tic() + mod.forwardBackward(dataBatch) + val res = mon.toc() + val keys = Array("act_0", "act_1", "data", "fc_0", "fc_1", "softmax") + val monResultCounts = Array(0, 0, 0, 0, 0, 0) + assert(res.length == 21) + for ((n, k, v) <- res) { + var break = false + for ((key, idx) <- keys.zipWithIndex) { + if (!break && k.startsWith(key)) { + monResultCounts(idx) += 1 + break = true + } + } + } + assert(monResultCounts.zip(Array(2, 2, 1, 6, 6, 4)).forall(x => x._1 == x._2)) + } + + test ("forward reshape") { + val numClass = 10 + val data1 = Symbol.Variable("data1") + val data2 = Symbol.Variable("data2") + val conv1 = Symbol.Convolution()()(Map("data" -> data1, + "kernel" -> "(2, 2)", "num_filter" -> 2, "stride" -> "(2, 2)")) + val conv2 = Symbol.Convolution()()(Map("data" -> data2, + "kernel" -> "(3, 3)", "num_filter" -> 3, "stride" -> "(1, 1)")) + val pooling1 = Symbol.Pooling()()(Map("data" -> conv1, + "kernel" -> "(2, 2)", "pool_type" -> "avg", "stride" -> "(1, 1)")) + val pooling2 = Symbol.Pooling()()(Map("data" -> conv2, + "kernel" -> "(2, 2)", "pool_type" -> "max", "stride" -> "(1, 1)")) + val flatten1 = Symbol.flatten()()(Map("data" -> pooling1)) + val flatten2 = Symbol.flatten()()(Map("data" -> pooling2)) + val sum = Symbol.sum()()(Map("data" -> flatten1, "axis" -> 1)) + + Symbol.sum()()(Map("data" -> flatten2, "axis" -> 1)) + val fc = Symbol.FullyConnected()()( + Map("data" -> sum, "num_hidden" -> numClass)) + val sym = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc)) + + var dShape1 = Shape(10, 3, 64, 64) + 
var dShape2 = Shape(10, 3, 32, 32) + var lShape = Shape(10) + + val mod = new Module(sym, IndexedSeq("data1", "data2")) + mod.bind(dataShapes = IndexedSeq( + DataDesc("data1", dShape1), DataDesc("data2", dShape2)), + labelShapes = Option(IndexedSeq(DataDesc("softmax_label", lShape))) + ) + mod.initParams() + mod.initOptimizer(optimizer = new SGD(learningRate = 0.01f)) + + // Train with original data shapes + var dataBatch = new DataBatch( + data = IndexedSeq( + NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(), + NDArray.random_uniform(Map("low" -> 5, "high" -> 15, "shape" -> dShape2.toString()))()), + label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0) + mod.forward(dataBatch) + assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass)) + mod.backward() + mod.update() + + dShape1 = Shape(3, 3, 64, 64) + dShape2 = Shape(3, 3, 32, 32) + lShape = Shape(3) + dataBatch = new DataBatch( + data = IndexedSeq( + NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(), + NDArray.random_uniform(Map("low" -> 5, "high" -> 15, "shape" -> dShape2.toString()))()), + label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0) + mod.forward(dataBatch) + assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass)) + mod.backward() + mod.update() + + dShape1 = Shape(20, 3, 64, 64) + dShape2 = Shape(20, 3, 32, 32) + lShape = Shape(20) + dataBatch = new DataBatch( + data = IndexedSeq( + NDArray.random_uniform(Map("low" -> 3, "high" -> 5, "shape" -> dShape1.toString()))(), + NDArray.random_uniform(Map("low" -> 10, "high" -> 25, "shape" -> dShape2.toString()))()), + label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0) + mod.forward(dataBatch) + assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass)) + mod.backward() + mod.update() + + // Train with both different batch size and data shapes + dShape1 = Shape(20, 3, 120, 120) + dShape2 = Shape(20, 3, 32, 64) + lShape = Shape(20) + dataBatch = new DataBatch( + data = IndexedSeq( + NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(), + NDArray.random_uniform(Map("low" -> 5, "high" -> 15, "shape" -> dShape2.toString()))()), + label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0) + mod.forward(dataBatch) + assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass)) + mod.backward() + mod.update() + + dShape1 = Shape(5, 3, 28, 40) + dShape2 = Shape(5, 3, 24, 16) + lShape = Shape(5) + dataBatch = new DataBatch( + data = IndexedSeq( + NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(), + NDArray.random_uniform(Map("low" -> 15, "high" -> 25, "shape" -> dShape2.toString()))()), + label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0) + mod.forward(dataBatch) + assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass)) + mod.backward() + mod.update() + } +} diff --git a/scala-package/core/src/test/scala/ml/dmlc/mxnet/OperatorSuite.scala b/scala-package/core/src/test/scala/ml/dmlc/mxnet/OperatorSuite.scala index 187869c3af21..ac1cee202e5b 100644 --- a/scala-package/core/src/test/scala/ml/dmlc/mxnet/OperatorSuite.scala +++ b/scala-package/core/src/test/scala/ml/dmlc/mxnet/OperatorSuite.scala @@ -239,7 +239,7 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll var exe = x.simpleBind(ctx = Context.cpu(), gradReq = "write", shapeDict = Map()) exe.forward(isTrain = false) assert(exe.gradArrays.length == 0) - 
assert(CheckUtils.reldiff(result.toArray, exe.outputs.head.toArray) <= 1e-5f) + assert(CheckUtils.reldiff(result.toArray, exe.outputs.head.toArray) <= 1e-4f) } } From c9aacaa7bfa9de0036837a504b67229be8404c67 Mon Sep 17 00:00:00 2001 From: Sheng Zha Date: Fri, 4 Aug 2017 10:12:31 -0700 Subject: [PATCH 20/26] fix cpplint (#7332) --- cpp-package/include/mxnet-cpp/lr_scheduler.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp-package/include/mxnet-cpp/lr_scheduler.h b/cpp-package/include/mxnet-cpp/lr_scheduler.h index 91f9b3c0a952..4c56b7ab3f0b 100644 --- a/cpp-package/include/mxnet-cpp/lr_scheduler.h +++ b/cpp-package/include/mxnet-cpp/lr_scheduler.h @@ -4,8 +4,8 @@ * \brief Scheduling learning rate */ -#ifndef CPP_PACKAGE_INCLUDE_MXNET_CPP_LR_SCHEDULER_H_ -#define CPP_PACKAGE_INCLUDE_MXNET_CPP_LR_SCHEDULER_H_ +#ifndef MXNET_CPP_LR_SCHEDULER_H_ +#define MXNET_CPP_LR_SCHEDULER_H_ #include "dmlc/logging.h" @@ -75,4 +75,4 @@ class FactorScheduler : public LRScheduler { } // namespace cpp } // namespace mxnet -#endif // CPP_PACKAGE_INCLUDE_MXNET_CPP_LR_SCHEDULER_H_ +#endif // MXNET_CPP_LR_SCHEDULER_H_ From c8db271dcea47ca60222d30aeeee8ff2c5336af9 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Sat, 5 Aug 2017 02:14:43 +0900 Subject: [PATCH 21/26] Add Autograd doc to gluon page (#7327) * Add Autograd doc to gluon page * Make autograd doc toplevel --- docs/api/python/autograd.md | 32 ++++++++++++++++++++++++++++++++ docs/api/python/index.md | 1 + 2 files changed, 33 insertions(+) create mode 100644 docs/api/python/autograd.md diff --git a/docs/api/python/autograd.md b/docs/api/python/autograd.md new file mode 100644 index 000000000000..440a1e4de289 --- /dev/null +++ b/docs/api/python/autograd.md @@ -0,0 +1,32 @@ +# Autograd Package + + +```eval_rst +.. currentmodule:: mxnet.autograd +``` + +```eval_rst +.. warning:: This package is currently experimental and may change in the near future. +``` + + + +## Autograd + +```eval_rst +.. currentmodule:: mxnet.autograd +``` + + +```eval_rst +.. 
autosummary:: + :nosignatures: + + record + pause + mark_variables + backward + set_training + set_recording +``` + diff --git a/docs/api/python/index.md b/docs/api/python/index.md index 4014a886a0d4..964ccde0145a 100644 --- a/docs/api/python/index.md +++ b/docs/api/python/index.md @@ -28,6 +28,7 @@ imported by running: ndarray symbol module + autograd gluon rnn kvstore From 79231e22fcc00dc47ba3ae34944891fcc111799d Mon Sep 17 00:00:00 2001 From: Dick Carter Date: Fri, 4 Aug 2017 10:22:40 -0700 Subject: [PATCH 22/26] Fixes improper deconv workspace alloc (#7326) --- src/operator/cudnn_convolution-inl.h | 40 +++++++++++------- src/operator/cudnn_deconvolution-inl.h | 57 +++++++++++++++++--------- tests/python/unittest/test_operator.py | 4 +- 3 files changed, 64 insertions(+), 37 deletions(-) diff --git a/src/operator/cudnn_convolution-inl.h b/src/operator/cudnn_convolution-inl.h index 508b1f8be84d..06887a94aa70 100644 --- a/src/operator/cudnn_convolution-inl.h +++ b/src/operator/cudnn_convolution-inl.h @@ -95,9 +95,8 @@ class CuDNNConvolutionOp : public Operator { CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); GetTempSize(ctx); - Tensor workspace = - ctx.requested[conv::kTempSpace].get_space_typed( - mshadow::Shape1(forward_workspace_), s); + Tensor workspace = AllocateTempWorkspace(ctx, forward_workspace_byte_); + size_t workspace_size = TensorSizeBytes(workspace); if (param_.kernel.ndim() == 2) { Tensor data = in_data[conv::kData].get(s); @@ -133,7 +132,7 @@ class CuDNNConvolutionOp : public Operator { forward_conv_desc_, algo_, workspace.dptr_, - forward_workspace_byte_, + workspace_size, req[conv::kOut] == kAddTo? &beta_add : &beta, out_desc_, out_ptr + out_offset_ * g)); @@ -203,9 +202,8 @@ class CuDNNConvolutionOp : public Operator { data_ptr = data.dptr_; gdata_ptr = gdata.dptr_; } - Tensor workspace = - ctx.requested[conv::kTempSpace].get_space_typed( - mshadow::Shape1(backward_workspace_), s); + Tensor workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_); + size_t workspace_size = TensorSizeBytes(workspace); for (uint32_t g = 0; g < param_.num_group; ++g) { typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; @@ -231,7 +229,7 @@ class CuDNNConvolutionOp : public Operator { backward_conv_desc_, back_algo_w_, workspace.dptr_, - backward_workspace_byte_, + workspace_size, req[conv::kWeight] == kAddTo? &beta_add : &beta, filter_desc_, gwmat_ptr + weight_offset_ * g)); @@ -245,7 +243,7 @@ class CuDNNConvolutionOp : public Operator { backward_conv_desc_, back_algo_w_, workspace.dptr_, - backward_workspace_byte_, + workspace_size, req[conv::kWeight] == kAddTo? &beta_add : &beta, filter_desc_, gwmat_ptr + weight_offset_ * g)); @@ -262,7 +260,7 @@ class CuDNNConvolutionOp : public Operator { backward_conv_desc_, back_algo_, workspace.dptr_, - backward_workspace_byte_, + workspace_size, req[conv::kData] == kAddTo? &beta_add : &beta, in_desc_, gdata_ptr + data_offset_ * g)); @@ -276,7 +274,7 @@ class CuDNNConvolutionOp : public Operator { backward_conv_desc_, back_algo_, workspace.dptr_, - backward_workspace_byte_, + workspace_size, req[conv::kData] == kAddTo? 
&beta_add : &beta, in_desc_, gdata_ptr + data_offset_ * g)); @@ -667,8 +665,6 @@ class CuDNNConvolutionOp : public Operator { algo_, &forward_workspace_byte_)); - forward_workspace_ = forward_workspace_byte_ / sizeof(DType) + 1; - backward_workspace_ = backward_workspace_byte_ / sizeof(DType) + 1; init_temp_size_ = true; } @@ -684,15 +680,29 @@ class CuDNNConvolutionOp : public Operator { CastTShapeToIntPtr(param_.pad, ¶m_pad_); } + // Allocates a 1D Tensor of words with size in bytes >= `size_bytes`. + // Always allocates at least one word. + mshadow::Tensor AllocateTempWorkspace(const OpContext &ctx, size_t size_bytes) { + mshadow::Stream *s = ctx.get_stream(); + size_t size_words = size_bytes / sizeof(DType) + 1; + return ctx.requested[conv::kTempSpace].get_space_typed( + mshadow::Shape1(size_words), s); + } + + // Returns the size in bytes of the 1D Tensor of words. + size_t TensorSizeBytes(const mshadow::Tensor &tensor) { + return tensor.MSize() * sizeof(DType); + } + std::vector param_stride_; std::vector param_dilate_; std::vector param_pad_; bool init_cudnn_; bool init_temp_size_; - size_t forward_workspace_; - size_t backward_workspace_; + // Temp workspace size in bytes needed for Forward() operation. size_t forward_workspace_byte_; + // Temp workspace size in bytes needed for Backward() operation. size_t backward_workspace_byte_; size_t data_offset_; size_t out_offset_; diff --git a/src/operator/cudnn_deconvolution-inl.h b/src/operator/cudnn_deconvolution-inl.h index 5bba1e5278fa..2e2ae3a8cb8f 100644 --- a/src/operator/cudnn_deconvolution-inl.h +++ b/src/operator/cudnn_deconvolution-inl.h @@ -92,9 +92,8 @@ class CuDNNDeconvolutionOp : public Operator { CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); GetTempSize(ctx); - Tensor workspace = - ctx.requested[deconv::kTempSpace].get_space_typed( - mshadow::Shape1(forward_workspace_), s); + Tensor workspace = AllocateTempWorkspace(ctx, forward_workspace_byte_); + size_t workspace_size = TensorSizeBytes(workspace); if (param_.kernel.ndim() == 2) { Tensor data = in_data[deconv::kData].get(s); @@ -131,7 +130,7 @@ class CuDNNDeconvolutionOp : public Operator { forward_conv_desc_, // this backward algorithm used for inference back_algo_, workspace.dptr_, - backward_workspace_byte_, + workspace_size, &beta, out_desc_, out.dptr_ + out_offset_ * g)); @@ -145,7 +144,7 @@ class CuDNNDeconvolutionOp : public Operator { forward_conv_desc_, // this backward algorithm used for inference back_algo_, workspace.dptr_, - backward_workspace_byte_, + workspace_size, &beta, out_desc_, out_ptr + out_offset_ * g)); @@ -222,9 +221,8 @@ class CuDNNDeconvolutionOp : public Operator { CHECK_NE(req[deconv::kBias], kWriteInplace); } CHECK_NE(req[deconv::kData], kWriteInplace); - Tensor workspace = - ctx.requested[deconv::kTempSpace].get_space_typed( - mshadow::Shape1(backward_workspace_), s); + Tensor workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_); + size_t workspace_size = TensorSizeBytes(workspace); for (uint32_t g = 0; g < param_.num_group; ++g) { typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType bias_beta = 0.0f; @@ -257,7 +255,7 @@ class CuDNNDeconvolutionOp : public Operator { backward_conv_desc_, back_algo_w_, workspace.dptr_, - backward_workspace_byte_, + workspace_size, &weight_beta, filter_desc_, gwmat.dptr_ + weight_offset_ * g)); @@ -272,7 +270,7 @@ class CuDNNDeconvolutionOp : public Operator { backward_conv_desc_, back_algo_w_, workspace.dptr_, - backward_workspace_byte_, + workspace_size, 
&weight_beta, filter_desc_, gwmat_ptr + weight_offset_ * g)); @@ -288,7 +286,7 @@ class CuDNNDeconvolutionOp : public Operator { backward_conv_desc_, algo_, workspace.dptr_, - forward_workspace_byte_, + workspace_size, &data_beta, in_desc_, gdata_ptr + data_offset_ * g)); @@ -664,32 +662,34 @@ class CuDNNDeconvolutionOp : public Operator { void GetTempSize(const OpContext& ctx) { if (init_temp_size_) return; mshadow::Stream *s = ctx.get_stream(); - size_t back_size = 0, back_size_w = 0; + size_t back_data_algo_workspace_size = 0; + size_t back_filter_algo_workspace_size = 0; + size_t forward_algo_workspace_size = 0; CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_, filter_desc_, in_desc_, forward_conv_desc_, out_desc_, back_algo_, - &back_size)); + &back_data_algo_workspace_size)); CUDNN_CALL(cudnnGetConvolutionBackwardFilterWorkspaceSize(s->dnn_handle_, out_desc_, in_desc_, backward_conv_desc_, filter_desc_, back_algo_w_, - &back_size_w)); - backward_workspace_byte_ = std::max(back_size, back_size_w); + &back_filter_algo_workspace_size)); CUDNN_CALL(cudnnGetConvolutionForwardWorkspaceSize(s->dnn_handle_, out_desc_, filter_desc_, backward_conv_desc_, in_desc_, algo_, - &forward_workspace_byte_)); + &forward_algo_workspace_size)); - forward_workspace_ = forward_workspace_byte_ / sizeof(DType) + 1; - backward_workspace_ = backward_workspace_byte_ / sizeof(DType) + 1; + forward_workspace_byte_ = back_data_algo_workspace_size; + backward_workspace_byte_ = std::max(forward_algo_workspace_size, + back_filter_algo_workspace_size); init_temp_size_ = true; } @@ -704,14 +704,31 @@ class CuDNNDeconvolutionOp : public Operator { CastTShapeToIntPtr(param_.dilate, ¶m_dilate_); } + // Allocates a 1D Tensor of words with size in bytes >= `size_bytes`. + // Always allocates at least one word. + mshadow::Tensor AllocateTempWorkspace(const OpContext &ctx, size_t size_bytes) { + mshadow::Stream *s = ctx.get_stream(); + size_t size_words = size_bytes / sizeof(DType) + 1; + return ctx.requested[deconv::kTempSpace].get_space_typed( + mshadow::Shape1(size_words), s); + } + + // Returns the size in bytes of the 1D Tensor of words. + size_t TensorSizeBytes(const mshadow::Tensor &tensor) { + return tensor.MSize() * sizeof(DType); + } + std::vector param_stride_; std::vector param_dilate_; bool init_cudnn_; bool init_temp_size_; - size_t forward_workspace_; - size_t backward_workspace_; + // Temp workspace size in bytes needed for Forward() operation. Note that + // in deconvolution, this is handled by the cuDNN backprop-to-data kernel. size_t forward_workspace_byte_; + // Temp workspace size in bytes needed for Backward() operation. Note that + // in deconvolution, this is handled by the cuDNN forward kernel and the + // the cuDNN backprop-to-filter kernel. 
size_t backward_workspace_byte_; size_t data_offset_; size_t out_offset_; diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py index 51a77e0af221..7007da6a2910 100644 --- a/tests/python/unittest/test_operator.py +++ b/tests/python/unittest/test_operator.py @@ -677,7 +677,7 @@ def check_deconvolution_forward_backward(input_shape, num_filter, kernel, stride exe.forward(is_train=True) out = exe.outputs[0].asnumpy() exe.backward(out_grad) - assert_almost_equal(out, args_grad[0].asnumpy(), rtol=1E-3, atol=1e-4) + assert_almost_equal(out, args_grad[0].asnumpy(), rtol=1E-3, atol=1e-3) args_grad_addto_npy = [np.random.normal(size=s) for s in arg_shapes] args_grad_addto = [mx.nd.array(ele) for ele in args_grad_addto_npy] @@ -685,7 +685,7 @@ def check_deconvolution_forward_backward(input_shape, num_filter, kernel, stride exe.forward(is_train=True) out = exe.outputs[0].asnumpy() exe.backward(out_grad) - assert_almost_equal(out + args_grad_addto_npy[0], args_grad_addto[0].asnumpy(), rtol=1e-4, atol=1e-4) + assert_almost_equal(out + args_grad_addto_npy[0], args_grad_addto[0].asnumpy(), rtol=1e-4, atol=1e-3) def check_deconvolution_gradient(input_shape, num_filter, pad): From b230d1745fd8e5b01c9c741d9567e5561ea538cc Mon Sep 17 00:00:00 2001 From: Sergey Kolychev Date: Fri, 4 Aug 2017 11:57:31 -0700 Subject: [PATCH 23/26] Attempting to add Perl interface to Apache CI. (#7170) * attempting to add Perl interface to Apache CI. * second attempt. * forgot the test file. * changed the working dir for tests, removed test that started to fail because upstream bug. --- Jenkinsfile | 22 +++++++ perl-package/AI-MXNet/t/test_model_parallel.t | 62 ------------------- perl-package/test.sh | 16 +++++ tests/ci_build/Dockerfile.cpu | 2 + tests/ci_build/Dockerfile.gpu | 2 + tests/ci_build/install/ubuntu_install_perl.sh | 4 ++ 6 files changed, 46 insertions(+), 62 deletions(-) delete mode 100644 perl-package/AI-MXNet/t/test_model_parallel.t create mode 100755 perl-package/test.sh create mode 100755 tests/ci_build/install/ubuntu_install_perl.sh diff --git a/Jenkinsfile b/Jenkinsfile index e48ecf207955..b3bf82689e56 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -281,6 +281,28 @@ try { } } }, + 'Perl: CPU': { + node('mxnetlinux') { + ws('workspace/ut-perl-cpu') { + init_git() + unpack_lib('cpu') + timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} cpu ./perl-package/test.sh /workspace/ut-perl-cpu /workspace/ut-perl-cpu" + } + } + } + }, + 'Perl: GPU': { + node('mxnetlinux') { + ws('workspace/ut-perl-gpu') { + init_git() + unpack_lib('gpu') + timeout(time: max_time, unit: 'MINUTES') { + sh "${docker_run} gpu ./perl-package/test.sh /workspace/ut-perl-gpu /workspace/ut-perl-gpu" + } + } + } + }, 'R: CPU': { node('mxnetlinux') { ws('workspace/ut-r-cpu') { diff --git a/perl-package/AI-MXNet/t/test_model_parallel.t b/perl-package/AI-MXNet/t/test_model_parallel.t deleted file mode 100644 index e20b208029b5..000000000000 --- a/perl-package/AI-MXNet/t/test_model_parallel.t +++ /dev/null @@ -1,62 +0,0 @@ -use strict; -use warnings; -use Test::More tests => 3; -use AI::MXNet qw(mx); -use AI::MXNet::TestUtils qw(reldiff); -use AI::MXNet::Base; - -sub test_chain -{ - my $n = 2; - my $data1 = mx->sym->Variable('data1'); - my $data2 = mx->sym->Variable('data2'); - my $net; - { - local($mx::AttrScope) = mx->AttrScope(ctx_group=>'dev1'); - $net = $data1 + $data2; - $net = $net * 3; - } - - { - local($mx::AttrScope) = mx->AttrScope(ctx_group=>'dev2'); - $net = $net + $data1; - } - my $arr; - 
my $arr_grad; - my $shape = [4, 5]; - { - local($mx::Context) = mx->Context(mx->cpu(0)); - $arr = [map { mx->nd->empty($shape) } 0..$n-1]; - $arr_grad = [map { mx->nd->empty($shape) } 0..$n-1]; - } - - my $exec1 = $net->bind( - ctx => mx->cpu(), - args => $arr, - args_grad => $arr_grad, - group2ctx => { dev1 => mx->cpu(0), dev2 => mx->cpu(1) } - ); - $arr->[0] .= 1; - $arr->[1] .= 2; - my $arr2 = [map { $_->copyto(mx->cpu()) } @$arr]; - my $arr_grad2 = [map { $_->copyto(mx->cpu()) } @$arr_grad]; - my $exec2 = $net->bind( - ctx => mx->cpu(), - args => $arr2, - args_grad => $arr_grad2 - ); - - $exec1->forward(1); - $exec2->forward(1); - ok(reldiff($exec1->outputs->[0]->aspdl, $exec2->outputs->[0]->aspdl) < 1e-6); - my $out_grad = mx->nd->empty($shape, ctx => mx->cpu(1)); - $out_grad .= 1; - $exec1->backward([$out_grad]); - $exec2->backward([$out_grad->copyto(mx->cpu())]); - zip(sub { - my ($a, $b) = @_; - ok(reldiff($a->aspdl, $b->aspdl) < 1e-6); - }, $arr_grad, $arr_grad2); -} - -test_chain(); diff --git a/perl-package/test.sh b/perl-package/test.sh new file mode 100755 index 000000000000..c83120f31546 --- /dev/null +++ b/perl-package/test.sh @@ -0,0 +1,16 @@ +MXNET_HOME=$1 +HOME=$2 +export LD_LIBRARY_PATH=${MXNET_HOME}/lib +export PERL5LIB=${HOME}/perl5/lib/perl5 + +cd ${MXNET_HOME}/perl-package/AI-MXNetCAPI/ +perl Makefile.PL INSTALL_BASE=${HOME}/perl5 +make install || exit -1 + +cd ${MXNET_HOME}/perl-package/AI-NNVMCAPI/ +perl Makefile.PL INSTALL_BASE=${HOME}/perl5 +make install || exit -1 + +cd ${MXNET_HOME}/perl-package/AI-MXNet/ +perl Makefile.PL INSTALL_BASE=${HOME}/perl5 +make test || exit -1 diff --git a/tests/ci_build/Dockerfile.cpu b/tests/ci_build/Dockerfile.cpu index c9ba57c6ad46..c7bb0af0f79c 100644 --- a/tests/ci_build/Dockerfile.cpu +++ b/tests/ci_build/Dockerfile.cpu @@ -8,3 +8,5 @@ COPY install/ubuntu_install_scala.sh /install/ RUN /install/ubuntu_install_scala.sh COPY install/ubuntu_install_r.sh /install/ RUN /install/ubuntu_install_r.sh +COPY install/ubuntu_install_perl.sh /install/ +RUN /install/ubuntu_install_perl.sh diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index cd9986ec01a2..a2893a9fb44f 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -8,3 +8,5 @@ COPY install/ubuntu_install_scala.sh /install/ RUN /install/ubuntu_install_scala.sh COPY install/ubuntu_install_r.sh /install/ RUN /install/ubuntu_install_r.sh +COPY install/ubuntu_install_perl.sh /install/ +RUN /install/ubuntu_install_perl.sh diff --git a/tests/ci_build/install/ubuntu_install_perl.sh b/tests/ci_build/install/ubuntu_install_perl.sh new file mode 100755 index 000000000000..da4df67a464a --- /dev/null +++ b/tests/ci_build/install/ubuntu_install_perl.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +# install libraries for mxnet's perl package on ubuntu +apt-get update && apt-get install -y libmouse-perl pdl cpanminus swig libgraphviz-perl +cpanm -q Function::Parameters From 43d1d2c58fca2654818d5f16f664859234dc8dac Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Fri, 4 Aug 2017 15:41:28 -0700 Subject: [PATCH 24/26] Build versioning website (#7340) * Build versioning website Fix build versioned doc More fix More fix More fix Fix Fix More fix Change url Fix more Fix more Fix Fix Fix Rollback Rollback Fix Fix Fix Fix Fix Fix Fix Fix Fix Fix Add url as command line arguments More fix Change addversion path Fix Fix Fix Fix Fix * Small changes --- docs/_static/js/navbar.js | 6 ++- docs/_static/mxnet-theme/navbar.html | 4 +- docs/_static/mxnet.css | 5 +- 
docs/build_version_doc/AddVersion.py | 58 ++++++++++++++++++++++ docs/build_version_doc/build_doc.sh | 74 ++++++++++++++++++++++++++++ tests/ci_build/Dockerfile.doc | 2 +- 6 files changed, 143 insertions(+), 6 deletions(-) create mode 100644 docs/build_version_doc/AddVersion.py create mode 100755 docs/build_version_doc/build_doc.sh diff --git a/docs/_static/js/navbar.js b/docs/_static/js/navbar.js index 9c3164ee18ea..91e0356d9263 100644 --- a/docs/_static/js/navbar.js +++ b/docs/_static/js/navbar.js @@ -3,6 +3,7 @@ var TITLE = ['/get_started/', '/tutorials/', '/how_to/', '/api/', '/architecture var APIsubMenu; $("#burgerMenu").children().each(function () { if($(this).children().first().html() == 'API') APIsubMenu = $(this).clone() + if($(this).children().first().html().startsWith('Versions')) VersionsubMenu = $(this).clone() }); function navbar() { @@ -38,9 +39,12 @@ function navbar() { } $("#plusMenu").empty(); for (var i = 0; i < plusMenuList.length; ++i) { - if(plusMenuList[i].html().length > 20) { + if(plusMenuList[i].attr('id') == 'dropdown-menu-position-anchor') { $("#plusMenu").append(APIsubMenu); } + else if(plusMenuList[i].attr('id') == 'dropdown-menu-position-anchor-version') { + $("#plusMenu").append(VersionsubMenu); + } else { $("#plusMenu").append("
  • "); plusMenuList[i].removeClass("main-nav-link"); diff --git a/docs/_static/mxnet-theme/navbar.html b/docs/_static/mxnet-theme/navbar.html index 1887f8cf520d..c88fb58bb5c2 100644 --- a/docs/_static/mxnet-theme/navbar.html +++ b/docs/_static/mxnet-theme/navbar.html @@ -74,7 +74,7 @@

    {{searchform('', False)}} diff --git a/docs/_static/mxnet.css b/docs/_static/mxnet.css index c71d9ef6880a..6f6d8cda1351 100644 --- a/docs/_static/mxnet.css +++ b/docs/_static/mxnet.css @@ -189,7 +189,9 @@ img { text-decoration: none; } -#dropdown-menu-position-anchor { +#dropdown-menu-position-anchor, +#dropdown-menu-position-anchor-version, +#dropdown-menu-position-anchor-version-mobile { position: relative; } @@ -358,7 +360,6 @@ div .burgerIcon a { li.dropdown-submenu ul.dropdown-menu { min-width: 75px; - width: 75px } li.dropdown-submenu ul.dropdown-menu li { diff --git a/docs/build_version_doc/AddVersion.py b/docs/build_version_doc/AddVersion.py new file mode 100644 index 000000000000..ee46ef5ffd12 --- /dev/null +++ b/docs/build_version_doc/AddVersion.py @@ -0,0 +1,58 @@ +import os +import argparse +from bs4 import BeautifulSoup as bs + +parser = argparse.ArgumentParser(description="Manipulate index page", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--file_path', type=str, default='mxnet/docs/_build/html/', + help='file to be modified') +parser.add_argument('--current_version', type=str, default='master', + help='Current version') +parser.add_argument('--root_url', type=str, default='https://mxnet.io', + help='Root URL') + +if __name__ == '__main__': + args = parser.parse_args() + + root_url = args.root_url + tag_list = list() + with open('tag_list.txt', 'r') as tag_file: + for line in tag_file: + tag_list.append(line.lstrip().rstrip()) + tag_list.append('master') + + version_str = '' \ + 'Versions(%s)' \ + '' + + for path, subdirs, files in os.walk(args.file_path): + for name in files: + if not name.endswith('.html'): + continue + with open(os.path.join(path, name), 'r') as html_file: + content = bs(html_file, 'html.parser') + navbar = content.find(id="main-nav") + navbar_mobile = content.find(id="burgerMenu") + if navbar and navbar_mobile: + version_tag = content.find(id="dropdown-menu-position-anchor-version") + version_tag_mobile = content.find(id="dropdown-menu-position-anchor-version-mobile") + if version_tag: + version_tag.extract() + if version_tag_mobile: + version_tag_mobile.extract() + navbar.append(version_str) + navbar_mobile.append(version_str_mobile) + outstr = str(content).replace('<', '<').replace('>', '>') + with open(os.path.join(path, name), "w") as outf: + outf.write(outstr) \ No newline at end of file diff --git a/docs/build_version_doc/build_doc.sh b/docs/build_version_doc/build_doc.sh new file mode 100755 index 000000000000..046dae2d02d0 --- /dev/null +++ b/docs/build_version_doc/build_doc.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +web_url="$1" +web_folder="VersionedWeb" +local_build="latest" +web_branch="$2" +git clone $web_url $web_folder +cd $web_folder +git checkout -b $web_branch "origin/$web_branch" +cd .. 
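+# At this point $web_folder holds the published site checked out on $web_branch,
+# and $local_build is a staging area for the site assembled below.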
+mkdir "$local_build" + +# Fetch tag information +tag_list_file="tag_list.txt" +cp "$web_folder/tag.txt" "$tag_list_file" +tag_list=() +while read -r line +do + tag_list+=("$line") +done < "$tag_list_file" +latest_tag=${tag_list[0]} +echo "latest_tag is: $latest_tag" +commit_id=$(git rev-parse HEAD) +curr_tag=${TAG} +curr_tag=${curr_tag:5} +echo "Current tag is $curr_tag" +if [[ "$curr_tag" != 'master' ]] && [ $curr_tag != $latest_tag ] +then + latest_tag=$curr_tag +fi + +# Build new released tag +if [ $latest_tag != ${tag_list[0]} ] +then + echo "Building new tag" + git submodule update + make docs || exit 1 + echo -e "$latest_tag\n$(cat $tag_list_file)" > "$tag_list_file" + cat $tag_list_file + tests/ci_build/ci_build.sh doc python docs/build_version_doc/AddVersion.py --file_path "docs/_build/html/" \ + --current_version "$latest_tag" --root_url "http://mxnet.incubator.apache.org/" + cp -a "docs/_build/html/." "$local_build" + cp $tag_list_file "$local_build/tag.txt" + rm -rf "$web_folder/.git" + cp -a "$web_folder/versions/." "$local_build/versions" + mkdir "$local_build/versions/${tag_list[0]}" + cp -a "$web_folder/." "$local_build/versions/${tag_list[0]}" || exit 1 + rm -rf "$local_build/versions/${tag_list[0]}/versions" + rm -rf "$web_folder/*" + cp -a "$local_build/." "$web_folder" +fi + +# Build latest master +git checkout VersionedDoc +git checkout -- . +git submodule update +echo "Building master" +make docs || exit 1 + +rm -rfv "$web_folder/versions/master/*" +cp -a "docs/_build/html/." "$web_folder/versions/master" +tests/ci_build/ci_build.sh doc python docs/build_version_doc/AddVersion.py --file_path "$web_folder/versions/master" \ + --root_url "http://mxnet.incubator.apache.org/" + +# Update version list for all previous version website +if [ $latest_tag != ${tag_list[0]} ] +then + total=${#tag_list[*]} + for (( i=0; i<=$(( $total -1 )); i++ )) + do + tests/ci_build/ci_build.sh doc python docs/build_version_doc/AddVersion.py --file_path "$web_folder/versions/${tag_list[$i]}" \ + --current_version "${tag_list[$i]}" --root_url "http://mxnet.incubator.apache.org/" + done +fi diff --git a/tests/ci_build/Dockerfile.doc b/tests/ci_build/Dockerfile.doc index 622d946665cc..43d1fa97ac37 100644 --- a/tests/ci_build/Dockerfile.doc +++ b/tests/ci_build/Dockerfile.doc @@ -12,4 +12,4 @@ RUN wget http://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.deb && \ dpkg -i scala-2.11.8.deb && rm scala-2.11.8.deb RUN apt-get install -y doxygen libatlas-base-dev graphviz pandoc -RUN pip install sphinx==1.3.5 CommonMark==0.5.4 breathe mock recommonmark pypandoc +RUN pip install sphinx==1.3.5 CommonMark==0.5.4 breathe mock recommonmark pypandoc beautifulsoup4 From 15ffc8231f3f3c870770055294c50bbeb13ac39a Mon Sep 17 00:00:00 2001 From: lxn2 Date: Fri, 4 Aug 2017 17:32:06 -0700 Subject: [PATCH 25/26] Add DISCLAIMER and lxn2 GPG keys (#7344) --- DISCLAIMER | 12 +++++++++++ KEYS | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 DISCLAIMER diff --git a/DISCLAIMER b/DISCLAIMER new file mode 100644 index 000000000000..8adc57f6e6b0 --- /dev/null +++ b/DISCLAIMER @@ -0,0 +1,12 @@ +Apache MXNet (incubating) is an effort undergoing incubation at The +Apache Software Foundation (ASF), sponsored by the Apache Incubator PMC. 
+ +Incubation is required of all newly accepted +projects until a further review indicates that the +infrastructure, communications, and decision making process have +stabilized in a manner consistent with other successful ASF +projects. + +While incubation status is not necessarily a reflection +of the completeness or stability of the code, it does indicate +that the project has yet to be fully endorsed by the ASF. diff --git a/KEYS b/KEYS index 28c497fceb68..19ec1a3e5f15 100644 --- a/KEYS +++ b/KEYS @@ -71,3 +71,62 @@ DkFmj1upWAayCoXTpKzsHBvJZPC+Wqf9Pl3O47apelg7KxU3S011YfXpVPvCTKBv kD2o/5GKWS5QkSUEUXXY1oDiLg== =f8kJ -----END PGP PUBLIC KEY BLOCK----- +pub rsa4096 2017-07-12 [SC] + 406DCA257CD2BE237B79AE6BC9D353CA4AFF2E24 +uid [ultimate] Ly Nguyen (CODE SIGNING KEY) +sig 3 C9D353CA4AFF2E24 2017-07-12 Ly Nguyen (CODE SIGNING KEY) +sub rsa4096 2017-07-12 [E] +sig C9D353CA4AFF2E24 2017-07-12 Ly Nguyen (CODE SIGNING KEY) + +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBFlmSIMBEADIr6FzNJ6o/owjqgqWdOtreIRuU47/uzNRZw8c2lEys2Fw+3CI +iUitkWpb7jR0BGLk+8yUk+1VGdXPuJ+zj8XWcCnCJ7TUy3Hudp/BrX7y388m9hP9 +3LP5yx+AUKbXRZiEr5EG2lyTmJBB5lmreVlRMs74Ie3uFtH6US/DVZMqULEtumcH +yCL30kKugUjfftO1mbx901kB0WpB705od3Wrde0Jd9sniMz4HkXMsd93gExh/s1H +3XApXes+yDIEILiUJRawgzgcPIuTyOq4bbafoiFd8ipZU0G7AQPtNUAnpTUtrUaJ +5CDGzOiqGUgwi+M3zwsRcW2MjDi9MyNTmlW2P6Gifzn3EaJ0EVdz4fPmIokC5h+H +6nMHqSPUEu0WA/qpirVrOiUku34lpkP0vZwb8UOyjgBCFTxDMPX70DuUmCbij1rr +vGM0rKLV+LFclEQFpnXckUnza8f/Zbk9T3yWcPQykXyi7+1Z1WJSPVkF4l8ynpDy +4DdUnLGdF8HZAGHdroi/jGVrH2NYy42XQqOZoLfk2BTGiFYpQem/Bfzo3OdEPBT7 +zpZUVqixtXbnGseL1sdHao1BdinIbvSpPOPEbObINenk65NtXWc+9YbauGkJ5kwd +opAkBmZC4IycFWkpmHecbGXJN61eYvARuXKAev7DeYH7g6Zuzp4n07rtIwARAQAB +tC5MeSBOZ3V5ZW4gKENPREUgU0lHTklORyBLRVkpIDxseG4yQGFwYWNoZS5vcmc+ +iQJOBBMBCgA4FiEEQG3KJXzSviN7ea5rydNTykr/LiQFAllmSIMCGwMFCwkIBwMF +FQoJCAsFFgIDAQACHgECF4AACgkQydNTykr/LiT2/Q//aW1qOLX7msuJDqhlHFIM +hCUZzWClljfCHMHZJooJY5YOcvzE5mVgwVdWjgAgZfgk/bFsNhuOb+jIqlatsNfI +Eg7sm6VjfHRo3pP1W7NN+CQNu5JnEEZAIVLy2gn+Eq1rQc7g2pfylVh/HV14TGon +OWbk7BfaZubGLtLJTIimHAPd+TrRsGsLnd9JiDZj0gsPPKV6HHXHgZoAeStIUPNX +13mN/WMDAAqroPPUfMEMXPbmJgNf/ukIFxsS/y8MwU32BjVCBvvh8ojN3RIgUJnX +chdjT9i/QVKi9TyoF20R7mR80x/P9CBwqKoN9+QuHjTPDuZkol4xD3jyzOsKHPwZ +CpltwdhI2JCYJzEIFtrZ0R59fXJ+8NNXZzIOqnx83qarC+eSf8cunqPS/ZBIvEJ0 +qM1adZlJiY96La10wXSjYnEc+XEw+dad3D3ChVsvDceJirelaAVrRS2Dz4ugNShy +W0cZFFUL0aCTNNJnF9sHAfexbbg06BTzSSAeYrEWLmmpjEYHXAtFyToHzk0jTUr4 +66SeIUVHIqBLk8yx1L9zQK38JS9usYj1PFJri9J6iYyqiIS7zRinoO8MIySZOOGp +Z3Q5xJbnwzjwl4frGaXg2/zyD7rfQGG3P23WOselgNWMKuYtVAA+AHo/CxLIinKk +JAMljesV3vfeawK5HHnfcgK5Ag0EWWZIgwEQAMsmr5lOFe4n9iGdTciYFXxZYSEX +ZqmtWyxNsXkih2icfohygx/YLFBSkdXSfIywS7w7+Na4OYdhp3uaRdU+yA4ianY7 +qH5guni98KtyZmsRnnjT1DgyR0pNNqAdAyfWeCglMx5SWLLtzKxHazqF0t6Jb6M/ +sAew+KdoTXsYzKb9d/R81spvefJoBopaxKLF1tijaX98RiquKLlFBD+88XP6pxSB +nwNxNybgJVlGT/RdxPiRiRj0CySuvx27i8w8Rc2HaT9CFumzdy6moz+RJbuuIjDN +QzIOpNy4+LJKSysPGh8AwRu6xCl9gnfbJ9thiFwYGZ7S3lVvS23/poI1YzLZZY+5 +XvpiiogF7j5Aj/zTTli8BI/CiNVrGKJuzeJJyLFfBMmrbysi9mV/fR8wC7xd5P9g +LjElkA4j1Xv5I47AVsILAbHLhphpxNDoKBmr1EbP/CJitEYjRmdjn4Mo6sYwMlVN +CA+rl/VMS3Nc0Iixu/Y070H3kE9IfitksiuXIJfeX5RW/uWegEO1e1dSpi+rreb8 +lvVtQk4tMUHyM16qPqO08tPGSunt6J0HiPi7J+xDwbJjJS7gNDW4AYHG5q4/dZsx +PtpcZC7zFOlFV0BwFftYnluccDhsWPc48mDmmhOe9p42irMAx6ms/Y42jgh4OmgD +bjMzKIyYFI40URGnABEBAAGJAjYEGAEKACAWIQRAbcolfNK+I3t5rmvJ01PKSv8u +JAUCWWZIgwIbDAAKCRDJ01PKSv8uJCAtD/97SuVGnCP3kbWfI/qfTTVKwuWTdbIg +rPvOjGo5F57l1PAgARt8N1ccqREbR3JwhRdsU3ewz5eDQEyEZVffPgufhqZr8liI +EP783m83VgRSMKYt6HzORX0os2BapsHHuejvlME9XpN0UG5AnvbzXDxP3wJufB1K 
+GkmC+rlpqfyMu60xFXzym9QuePksbdf/xXZduvLGaB1u+AYtvHp3+NGV382vat7C +xwRShVJTb8Zr9y5tA+JDqfhDDb5CepcPH6Uk2frU8aV7vZ3hmVmGcDcUddu3U9hg +L7Lcpr1E0D7xOuQ4QMAFhcDO+aB8aPv+JRkH4Y6wDFPrEgcEJ1YK6hhW5KSdslyK +QrKHKMSl+hwPmh9fKX4wC+FjMMXJ/PHtEG3N3f7/TyyO4iza5xDIJkYcyKkDXc0l +VcHLJvtjsJziMJNV3lKAeTp/uzbaJHRhLmpPHukQPnlpjfhnmsYh3wydnd03pfzQ +k6XJ4iGeSSQqtW6T14yqkCl5HDH2ms1ufhe4Os217CMXnaRbM/K6Zl4iGGozzXgd +no02+jTN3NqmUw0hUBR/9ZEn+IKmZ6f0Azsgio0M9ez1T0CCDZvo19kJw9b3VdOF +TZQhIRekaaV+bCQQxnwDOJ31bIUUpxaMdvygjq55Gri/5C75TsMNcgbhqYWLGKe2 +kRsGTxyO+fQ6/Q== +=FuXU +-----END PGP PUBLIC KEY BLOCK----- From 0d8d27ece04613197711be0843af0de79822aa3b Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Fri, 4 Aug 2017 22:03:58 -0700 Subject: [PATCH 26/26] Fix gluon bottleneck v2 (#7339) * Fix Gluon Resnet BottleneckV2 * Fix --- python/mxnet/gluon/model_zoo/vision/resnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/mxnet/gluon/model_zoo/vision/resnet.py b/python/mxnet/gluon/model_zoo/vision/resnet.py index 5e2adad52781..48ba07941acb 100644 --- a/python/mxnet/gluon/model_zoo/vision/resnet.py +++ b/python/mxnet/gluon/model_zoo/vision/resnet.py @@ -177,11 +177,11 @@ class BottleneckV2(HybridBlock): def __init__(self, channels, stride, downsample=False, in_channels=0, **kwargs): super(BottleneckV2, self).__init__(**kwargs) self.bn1 = nn.BatchNorm() - self.conv1 = _conv3x3(channels//4, 1, in_channels) + self.conv1 = nn.Conv2D(channels//4, kernel_size=1, strides=1, use_bias=False) self.bn2 = nn.BatchNorm() self.conv2 = _conv3x3(channels//4, stride, channels//4) self.bn3 = nn.BatchNorm() - self.conv3 = _conv3x3(channels, 1, channels//4) + self.conv3 = nn.Conv2D(channels, kernel_size=1, strides=1, use_bias=False) if downsample: self.downsample = nn.Conv2D(channels, 1, stride, use_bias=False, in_channels=in_channels)
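For context on the BottleneckV2 fix above: in a ResNet-v2 bottleneck the first and third convolutions are 1x1 (channel reduction, then expansion), and only the middle one is 3x3, so building conv1 and conv3 with _conv3x3 gave every layer a 3x3 kernel and inflated the block's parameter count relative to the intended architecture. Below is a minimal sketch of the corrected wiring; the hybrid_forward follows the usual pre-activation ordering and is filled in as an assumption, since the patch itself only touches the constructor.

# bottleneck_v2_sketch.py -- hedged sketch of the corrected 1x1 -> 3x3 -> 1x1 wiring.
import mxnet as mx
from mxnet.gluon import nn, HybridBlock

def _conv3x3(channels, stride, in_channels):
    return nn.Conv2D(channels, kernel_size=3, strides=stride, padding=1,
                     use_bias=False, in_channels=in_channels)

class BottleneckV2Sketch(HybridBlock):
    def __init__(self, channels, stride, downsample=False, in_channels=0, **kwargs):
        super(BottleneckV2Sketch, self).__init__(**kwargs)
        self.bn1 = nn.BatchNorm()
        # 1x1 reduction to channels//4 -- the point of the fix
        self.conv1 = nn.Conv2D(channels // 4, kernel_size=1, strides=1, use_bias=False)
        self.bn2 = nn.BatchNorm()
        # the only 3x3, carrying the stride
        self.conv2 = _conv3x3(channels // 4, stride, channels // 4)
        self.bn3 = nn.BatchNorm()
        # 1x1 expansion back to `channels`
        self.conv3 = nn.Conv2D(channels, kernel_size=1, strides=1, use_bias=False)
        self.downsample = (nn.Conv2D(channels, 1, stride, use_bias=False,
                                     in_channels=in_channels)
                           if downsample else None)

    def hybrid_forward(self, F, x):
        residual = x
        x = F.Activation(self.bn1(x), act_type='relu')
        if self.downsample:
            residual = self.downsample(x)
        x = self.conv1(x)
        x = F.Activation(self.bn2(x), act_type='relu')
        x = self.conv2(x)
        x = F.Activation(self.bn3(x), act_type='relu')
        x = self.conv3(x)
        return x + residual

# Shape check: stride-2 block mapping 128 channels at 56x56 to 256 at 28x28.
blk = BottleneckV2Sketch(256, stride=2, downsample=True, in_channels=128)
blk.initialize()
print(blk(mx.nd.zeros((1, 128, 56, 56))).shape)   # expected (1, 256, 28, 28)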