From a587c4ef0fde916c6da931f8b6dd5fada92fe749 Mon Sep 17 00:00:00 2001
From: zixuanweeei <zixuan.wei@intel.com>
Date: Mon, 2 Sep 2019 18:18:03 +0800
Subject: [PATCH 1/3] MKL-DNN RNN checks NDArray version

---
 src/operator/rnn-inl.h | 79 +++++++++++-------------------------------
 src/operator/rnn.cc    | 14 ++++----
 2 files changed, 29 insertions(+), 64 deletions(-)
diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index 5eae413b078b..5652960baa43 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -409,10 +409,11 @@ class RNNOp {
   std::vector<mkldnn::memory> bias_memory;
   std::vector<mkldnn::memory> y_memory;
   std::vector<mkldnn::memory> hcy_memory;
+  size_t weights_version;
   bool has_cache;
   bool init_mem_;
   size_t reserve_mem_size_;
-  Storage::Handle mem_space_;
+  NDArray mem_space_;
 #endif
   explicit RNNOp(RNNParam param, Context ctx) {
     this->param_ = param;
@@ -522,12 +523,6 @@ class RNNOp {
   }
 
   ~RNNOp() {
-#if MXNET_USE_MKLDNN == 1
-    if (init_mem_) {
-      Storage::Get()->Free(mem_space_);
-      init_mem_ = false;
-    }
-#endif  // MXNET_USE_MKLDNN
 #if MXNET_USE_CUDNN == 1
     CUDNN_CALL(cudnnDestroyTensorDescriptor(hx_desc_));
     CUDNN_CALL(cudnnDestroyTensorDescriptor(cx_desc_));
@@ -560,17 +555,6 @@ class RNNOp {
     CUDNN_CALL(cudnnDestroyRNNDataDescriptor(dy_data_desc_));
 #endif  // MXNET_USE_CUDNN_GE_7200
 #endif  // MXNET_USE_CUDNN
-
-    if (ctx_.dev_type == kCPU) {
-      if (init_space_) {
-        Storage::Get()->Free(reserve_cpu_space_);
-        init_space_ = false;
-      }
-      if (temp_init_space_) {
-        Storage::Get()->Free(temp_cpu_space_);
-        temp_init_space_ = false;
-      }
-    }
   }
 
   void Forward(const OpContext &ctx, const std::vector<TBlob> &in_data,
@@ -855,37 +839,30 @@ class RNNOp {
 #endif  // MXNET_USE_CUDNN == 1 && defined(__CUDACC__)
 
     if (ctx_.dev_type == kCPU) {
+      // allocate temp space
+      const size_t work_cpu_space_size = GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_,
+          param_.state_size, direction, param_.mode);
+      if (!temp_init_space_ || temp_cpu_space_size_ < work_cpu_space_size) {
+        temp_cpu_space_size_ = work_cpu_space_size;
+        temp_cpu_space_ = NDArray(TShape({static_cast<dim_t>(temp_cpu_space_size_)}), ctx_,
+            false, in_data[rnn_enum::kData].type_flag_);
+        temp_init_space_ = true;
+      }
+      DType* work_cpu_space = static_cast<DType*>(temp_cpu_space_.data().dptr_);
+
       if (ctx.is_train) {
-        // allocate temp space
-        const size_t work_cpu_space_size =
-            GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_,
-                              param_.state_size, direction, param_.mode);
-        if (temp_init_space_ && temp_cpu_space_size_ < work_cpu_space_size) {
-            Storage::Get()->Free(temp_cpu_space_);
-            temp_init_space_ = false;
-        }
-        if (!temp_init_space_) {
-          temp_cpu_space_ = Storage::Get()->Alloc
-              (work_cpu_space_size * sizeof(DType), Context::CPU());
-          temp_cpu_space_size_ = work_cpu_space_size;
-          temp_init_space_ = true;
-        }
-        DType* work_cpu_space = static_cast<DType*>(temp_cpu_space_.dptr);
+        // allocate reserve space
 
         const size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction,
                                                      param_.seq_length_, param_.batch_size_,
                                                      param_.state_size, param_.mode);
-        if (init_space_ && reserve_cpu_space_size_ < r_size) {
-          Storage::Get()->Free(reserve_cpu_space_);
-          init_space_ = false;
-        }
-        if (!init_space_) {
-          reserve_cpu_space_ = Storage::Get()->Alloc(r_size * sizeof(DType), Context::CPU());
+        if (!init_space_ || reserve_cpu_space_size_ < r_size) {
           reserve_cpu_space_size_ = r_size;
+          reserve_cpu_space_ = NDArray(TShape({static_cast<dim_t>(reserve_cpu_space_size_)}), ctx_,
+              false, in_data[rnn_enum::kData].type_flag_);
           init_space_ = true;
         }
-
-        DType* reserve_space_ptr = static_cast<DType*>(reserve_cpu_space_.dptr);
+        DType* reserve_space_ptr = static_cast<DType*>(reserve_cpu_space_.data().dptr_);
 
         RNNForwardTraining<DType>(work_cpu_space,
                                   reserve_space_ptr,
@@ -945,20 +922,6 @@ class RNNOp {
 #endif  // MXNET_USE_MKLDNN == 1
           //  Before integrating MKLDNN GRU fp32 inference
           //  using below code for keep func being OK
-          const size_t work_cpu_space_size =
-              GetRNNWorkspaceSize(param_.seq_length_, param_.batch_size_,
-                                  param_.state_size, direction, param_.mode);
-          if (temp_init_space_ && temp_cpu_space_size_ < work_cpu_space_size) {
-            Storage::Get()->Free(temp_cpu_space_);
-            temp_init_space_ = false;
-          }
-          if (!temp_init_space_) {
-            temp_cpu_space_ = Storage::Get()->Alloc
-                (work_cpu_space_size * sizeof(DType), Context::CPU());
-            temp_cpu_space_size_ = work_cpu_space_size;
-            temp_init_space_ = true;
-          }
-          DType* work_cpu_space = static_cast<DType*>(temp_cpu_space_.dptr);
           RNNForwardInference<DType>(work_cpu_space,
                                      param_.state_outputs,
                                      param_.num_layers,
@@ -1171,7 +1134,7 @@ class RNNOp {
       if (!temp_init_space_ || temp_cpu_space_size_ != work_cpu_space_size) {
         LOG(FATAL) << "Check temp init error";
       }
-      DType* work_cpu_space = static_cast<DType*>(temp_cpu_space_.dptr);
+      DType* work_cpu_space = static_cast<DType*>(temp_cpu_space_.data().dptr_);
       size_t r_size = GetRNNReserveSpaceSize(param_.num_layers, direction,
                                              param_.seq_length_, param_.batch_size_,
                                              param_.state_size, param_.mode);
@@ -1180,7 +1143,7 @@ class RNNOp {
         LOG(FATAL) << "Check forward init error";
       }
 
-      DType* reserve_space_ptr = static_cast<DType*>(reserve_cpu_space_.dptr);
+      DType* reserve_space_ptr = static_cast<DType*>(reserve_cpu_space_.data().dptr_);
       RNNBackward<DType>(work_cpu_space,
                          reserve_space_ptr,
                          param_.num_layers,
@@ -1551,7 +1514,7 @@ class RNNOp {
 #endif  // MXNET_USE_CUDNN
   bool init_space_, temp_init_space_;
   size_t reserve_cpu_space_size_, temp_cpu_space_size_;
-  Storage::Handle reserve_cpu_space_, temp_cpu_space_;
+  NDArray reserve_cpu_space_, temp_cpu_space_;
 };  //  class RNNOp
 
 static OpStatePtr CreateRNNState(const nnvm::NodeAttrs &attrs,
diff --git a/src/operator/rnn.cc b/src/operator/rnn.cc
index 86fb1c7d1ec6..fd016d6819b9 100644
--- a/src/operator/rnn.cc
+++ b/src/operator/rnn.cc
@@ -270,22 +270,24 @@ static void RNNStatefulComputeCPU(const OpStatePtr& state_ptr,
 
       const size_t r_size = GetMKLDNNRNNCacheMemorySize(L, D, T, N, I, H, param.mode);
       if (op.init_mem_ && op.reserve_mem_size_ < r_size) {
-        Storage::Get()->Free(op.mem_space_);
         op.init_mem_ = false;
       }
+      const size_t weights_version = inputs[rnn_enum::kParams].version();
       if (!op.init_mem_) {
-        op.mem_space_ = Storage::Get()->Alloc(
-            r_size * sizeof(DType),
-            Context::CPU());
+        op.mem_space_ = NDArray(TShape({static_cast<dim_t>(r_size)}), op.ctx_, false, dtype);
         op.reserve_mem_size_ = r_size;
         op.init_mem_ = true;
         op.has_cache = false;
+        // Assign weights_version
+        op.weights_version = weights_version;
       }
-      if (op.has_cache && op.x_memory.size() == 0) {
+      // Check if NDArray was changed.
+      if (op.weights_version != weights_version) {
         op.has_cache = false;
+        op.weights_version = weights_version;
       }
 
-      DType* workptr = static_cast<DType*>(op.mem_space_.dptr);
+      DType* workptr = static_cast<DType*>(op.mem_space_.data().dptr_);
       mkldnn::memory::dims src_layer_tz_0 = {T, N, I};
       mkldnn::memory::dims src_layer_tz = {T, N, D * H};
       mkldnn::memory::dims dst_layer_tz = {T, N, D * H};

From 8c311d2137797ad96a167d363a521085bf8f6dac Mon Sep 17 00:00:00 2001
From: zixuanweeei <zixuan.wei@intel.com>
Date: Mon, 2 Sep 2019 18:33:31 +0800
Subject: [PATCH 2/3] Add UT

---
 tests/python/unittest/test_operator.py | 39 ++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 39ae0a02f0fe..b70072c9f25d 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -75,6 +75,45 @@ def check_rnn_consistency(cell1, cell2, T, N, I, H, grad_req, rtol=1e-2, atol=1e
         assert(mod2.get_input_grads()[0] == None)
 
 
+@with_seed()
+@assert_raises_cudnn_not_satisfied(min_version='5.1.10')
+def test_rnn_with_new_param():
+    rnn_modes = ['rnn_relu', 'rnn_tanh', 'gru', 'lstm']
+    ngates_ = [1, 1, 3, 4]
+    num_layers, input_size, seq_len, batch_size, state_size = 3, 128, 5, 64, 8
+    for bidirectional in [False, True]:
+        directions = 2 if bidirectional else 1
+        for mode, ngates in zip(rnn_modes, ngates_):
+            first_layer_size = (input_size * state_size + state_size * state_size + state_size * 2) * ngates
+            rest_layer_size = (state_size * directions * state_size + state_size * state_size + state_size * 2) \
+                 * ngates * (num_layers - 1)
+            param_size = (first_layer_size + rest_layer_size) * directions
+            sym = mx.sym.RNN(mode=mode, num_layers=num_layers, bidirectional=bidirectional,
+                state_outputs=False, state_size=state_size, name='rnn')
+
+            bind_dict = {
+                'rnn_data': mx.ndarray.random.uniform(low=-1, high=1, shape=(seq_len, batch_size, input_size)),
+                'rnn_parameters': mx.ndarray.random.uniform(low=-1, high=1, shape=(param_size)),
+                'rnn_state': mx.ndarray.zeros(shape=(num_layers * directions, batch_size, state_size))
+            }
+            if mode == 'lstm':
+                bind_dict['rnn_state_cell'] = mx.ndarray.zeros(
+                    shape=(num_layers * directions, batch_size, state_size))
+
+            ex = sym.bind(mx.cpu(), bind_dict)
+            ex.forward(is_train=True)
+            ex01 = ex.output_dict['rnn_output'].asnumpy()
+            ex.forward(is_train=False)
+            ex02 = ex.output_dict['rnn_output'].asnumpy()
+            assert_allclose(ex01, ex02, rtol=1e-2, atol=1e-4)
+            bind_dict['rnn_parameters'] = mx.ndarray.random.uniform(low=-1, high=1, shape=(param_size))
+            ex.copy_params_from(bind_dict)
+            ex.forward(is_train=True)
+            ex03 = ex.output_dict['rnn_output'].asnumpy()
+            ex.forward(is_train=False)
+            ex04 = ex.output_dict['rnn_output'].asnumpy()
+            assert_allclose(ex03, ex04, rtol=1e-2, atol=1e-4)
+
 
 @with_seed()
 @assert_raises_cudnn_not_satisfied(min_version='5.1.10')

From d2eed962b9ea5490210d9a4cc5c5c819efaf1d5d Mon Sep 17 00:00:00 2001
From: zixuanweeei <zixuan.wei@intel.com>
Date: Mon, 2 Sep 2019 20:05:01 +0800
Subject: [PATCH 3/3] Use default_context()

---
 tests/python/unittest/test_operator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index b70072c9f25d..ef86ad207928 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -100,7 +100,7 @@ def test_rnn_with_new_param():
                 bind_dict['rnn_state_cell'] = mx.ndarray.zeros(
                     shape=(num_layers * directions, batch_size, state_size))
 
-            ex = sym.bind(mx.cpu(), bind_dict)
+            ex = sym.bind(default_context(), bind_dict)
             ex.forward(is_train=True)
             ex01 = ex.output_dict['rnn_output'].asnumpy()
             ex.forward(is_train=False)