diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/Utils.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/Utils.scala
index ba2b9e2ffa0..6f43d2a1549 100644
--- a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/Utils.scala
+++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/Utils.scala
@@ -392,6 +392,51 @@ object Utils {
     Array(padH, padH, padW, padW, oheight, owidth)
   }
 
+  private[nn] def getOutSizeAndPaddingForDNN(
+    inputHeight: Int,
+    inputWidth: Int,
+    dH: Int,
+    dW: Int,
+    kH: Int,
+    kW: Int,
+    padH: Int,
+    padW: Int,
+    ceilMode: Boolean,
+    dilationHeight: Int = 1,
+    dilationWidth: Int = 1,
+    inputdepth: Int = -1,
+    dt: Int = -1,
+    kt: Int = -1,
+    padt: Int = 0,
+    dilationDepth: Int = 1): Array[Int] = {
+    // compute padding left, right, top and bottom
+    var pad_t = padH
+    var pad_b = padH
+    var pad_l = padW
+    var pad_r = padW
+
+    var oheight = 0
+    var owidth = 0
+
+    val dilationKernelHeight = dilationHeight * (kH - 1) + 1
+    val dilationKernelWidth = dilationWidth * (kW - 1) + 1
+
+    oheight = math.ceil(1.0 * (inputHeight - dilationKernelHeight + 2 * padH) / dH).toInt + 1
+    owidth = math.ceil(1.0 * (inputWidth - dilationKernelWidth + 2 * padW) / dW).toInt + 1
+
+    if (padH != 0 || padW != 0 || padt != 0 || kH == 1 || kW == 1) {
+      if ((oheight - 1) * dH >= inputHeight + padH) oheight -= 1
+      if ((owidth - 1) * dW >= inputWidth + padW) owidth -= 1
+    }
+
+    val h = inputHeight + pad_t
+    while ((h + pad_b) < (dH * (oheight - 1) + kH)) pad_b = pad_b + 1
+    val w = inputWidth + pad_l
+    while ((w + pad_r) < (dW * (owidth - 1) + kW)) pad_r = pad_r + 1
+
+    Array(pad_t, pad_b, pad_l, pad_r, oheight, owidth)
+  }
+
   private[nn] def getOutputShape(outputHeight: Int, outputWidth: Int, nOutputPlane: Int,
     batchSize: Int = -1, format: DataFormat): Array[Int] = {
     format match {
@@ -472,6 +517,40 @@ object Utils {
     out
   }
 
+  private[nn] def getPaddingAndOutputSize(
+    inputHeight: Int,
+    inputWidth: Int,
+    dH: Int,
+    dW: Int,
+    kH: Int,
+    kW: Int,
+    padH: Int,
+    padW: Int
+  ): (Int, Int, Int, Int, Int, Int) = {
+    // compute padding left, right, top and bottom
+    var pad_t = padH
+    var pad_b = padH
+    var pad_l = padW
+    var pad_r = padW
+
+    var oheight = 0
+    var owidth = 0
+
+    oheight = math.ceil(1.0 * (inputHeight - kH + 2 * padH) / dH).toInt + 1
+    owidth = math.ceil(1.0 * (inputWidth - kW + 2 * padW) / dW).toInt + 1
+
+    if (padH != 0 || padW != 0 || kH == 1 || kW == 1) {
+      if ((oheight - 1) * dH >= inputHeight + padH) oheight -= 1
+      if ((owidth - 1) * dW >= inputWidth + padW) owidth -= 1
+    }
+
+    val h = inputHeight + pad_t
+    while ((h + pad_b) < (dH * (oheight - 1) + kH)) pad_b = pad_b + 1
+    val w = inputWidth + pad_l
+    while ((w + pad_r) < (dW * (owidth - 1) + kW)) pad_r = pad_r + 1
+
+    (pad_t, pad_b, pad_l, pad_r, oheight, owidth)
+  }
   /**
    * Calculate forward time and backward time.
* @param times diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/abstractnn/AbstractModule.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/abstractnn/AbstractModule.scala index 43101375258..e05960c70d0 100644 --- a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/abstractnn/AbstractModule.scala +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/abstractnn/AbstractModule.scala @@ -320,7 +320,7 @@ abstract class AbstractModule[A <: Activity: ClassTag, B <: Activity: ClassTag, * If the module has parameters, this will zero the accumulation of the gradients with respect * to these parameters. Otherwise, it does nothing. */ - final def zeroGradParameters(): Unit = { + def zeroGradParameters(): Unit = { if (parameters() != null) { parameters()._1.zip(parameters()._2)foreach{ case (weight, grad) => grad.resizeAs(weight).zero() diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/AvgPooling.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/AvgPooling.scala new file mode 100644 index 00000000000..d4394a304eb --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/AvgPooling.scala @@ -0,0 +1,93 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl._ +import com.intel.analytics.bigdl.nn.Utils +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.tensor.Tensor + +class AvgPooling( + kW: Int, + kH: Int, + dW: Int = 1, + dH: Int = 1, + padW: Int = 0, + padH: Int = 0 +) extends MklDnnLayer { + @transient + private var paddingTL: Array[Int] = _ + @transient + private var paddingBR: Array[Int] = _ + @transient + private var fwdPD: Long = _ + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + _inputFormats = singleNativeData(inputs) + val strides = Array(dW, dH) + val kernel = Array(kH, kW) + val n = _inputFormats(0).shape(0) + val c = _inputFormats(0).shape(1) + val h = _inputFormats(0).shape(2) + val w = _inputFormats(0).shape(3) + val (pt, pb, pl, pr, oh, ow) = + Utils.getPaddingAndOutputSize(h, w, dH, dW, kH, kW, padH, padW) + paddingTL = Array(pt, pl) + paddingBR = Array(pb, pr) + val outputMD = MklDnn.MemoryDescInit(4, Array(n, c, oh, ow), DataType.F32, Memory.Format.any) + val description = MklDnn.PoolingForwardDescInit( + PropKind.Forward, AlgKind.PoolingAvgExcludePadding, + _inputFormats(0).getMemoryDescription(), outputMD, strides, kernel, paddingTL, paddingBR, + MklDnn.PaddingKind.mkldnnPaddingZero) + fwdPD = MklDnn.PrimitiveDescCreate(description, runtime.engine, 0L) + _outputFormats = Array(MemoryData.primitiveOutput(fwdPD)) + output = initTensor(_outputFormats(0)) + updateOutputPrimitives = Array(MklDnn.PrimitiveCreate2(fwdPD, + _inputFormats.map(_.getPrimitive(runtime)), Array(0), 1, + _outputFormats.map(_.getPrimitive(runtime)), 2)) + (_inputFormats, _outputFormats) + } + + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + _gradOutputFormats = singleNativeData(grad) + _gradOutputFormatsForWeight = _gradOutputFormats + val strides = Array(dW, dH) + val kernel = Array(kH, kW) + val description = MklDnn.PoolingBackwardDescInit(AlgKind.PoolingAvgExcludePadding, + _inputFormats(0).getMemoryDescription(), + _gradOutputFormats(0).getMemoryDescription(), + strides, kernel, paddingTL, paddingBR, MklDnn.PaddingKind.mkldnnPaddingZero) + + val pd = MklDnn.PrimitiveDescCreate(description, runtime.engine, fwdPD) + _gradInputFormats = Array(MemoryData.primitiveGradInput(pd)) + updateGradInputPrimitives = Array(MklDnn.PrimitiveCreate2(pd, + _gradOutputFormats.map(_.getPrimitive(runtime)), + Array(0, 0), 2, _gradInputFormats.map(_.getPrimitive(runtime)), 1)) + gradInput = initTensor(_gradInputFormats(0)) + (_gradOutputFormats, _gradInputFormats) + } +} + +object AvgPooling { + def apply( + kW: Int, + kH: Int, + dW: Int = 1, + dH: Int = 1, + padW: Int = 0, + padH: Int = 0 + ): AvgPooling = new AvgPooling(kW, kH, dW, dH, padW, padH) +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/CAddTable.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/CAddTable.scala new file mode 100644 index 00000000000..465ab1fba0c --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/CAddTable.scala @@ -0,0 +1,67 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{DataType, Memory, MklDnn} +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.utils.T + +class CAddTable extends MklDnnLayer { + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + _inputFormats = nativeData(inputs) + val shape = inputs(0).shape.clone() + for(i <- 1 until inputs.length) { + require(shape.length == inputs(i).shape.length, "dimension not match") + for(j <- 0 until shape.length) { + require(shape(j) == inputs(i).shape(j), "size not match") + } + } + + val outputMD = MklDnn.MemoryDescInit(shape.length, shape, DataType.F32, Memory.Format.any) + val scales = inputs.map(_ => 1f) + val pd = MklDnn.SumPrimitiveDescCreate(outputMD, inputs.length, scales, + inputs.map(_.getPrimitiveDescription(runtime))) + _outputFormats = Array(MemoryData.primitiveOutput(pd)) + updateOutputPrimitives = Array(MklDnn.PrimitiveCreate2(pd, + _inputFormats.map(_.getPrimitive(runtime)), new Array[Int](inputs.length), + _inputFormats.length, _outputFormats.map(_.getPrimitive(runtime)), 1)) + output = initTensor(_outputFormats(0)) + (_inputFormats, _outputFormats) + } + + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + _gradOutputFormats = grad + _gradOutputFormatsForWeight = grad + _gradInputFormats = new Array[MemoryData](_inputFormats.length).map(a => grad(0)) + gradInput = T() + (_gradOutputFormats, _gradInputFormats) + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + require(gradOutput.isTensor, "gradOutput should be a tensor") + val _gradInput = gradInput.toTable + var i = 1 + while(i <= _inputFormats.length) { + _gradInput(i) = gradOutput + i += 1 + } + gradInput + } +} + +object CAddTable { + def apply(): CAddTable = new CAddTable() +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ConcatTable.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ConcatTable.scala new file mode 100644 index 00000000000..b78e8c6dd54 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ConcatTable.scala @@ -0,0 +1,191 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.nn.mkldnn +import com.intel.analytics.bigdl.mkl.{DataType, Memory, MklDnn} +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.tensor.{DnnTensor, Tensor} +import com.intel.analytics.bigdl.utils.{T, Table} + +import scala.collection.mutable.ArrayBuffer + +class ConcatTable extends MklDnnContainer { + + output = T() + + @transient + private var sumPrimitive: Array[Long] = null + @transient + private var tensors: Array[Tensor[Float]] = null + @transient + private var tensorPrimitives: Array[Long] = null + + override def updateOutput(input: Activity): Activity = { + require(modules.length > 0, "empty modules of concat table") + var i = 0 + while (i < modules.length) { + val currentOutput = modules(i).forward( + reorderManager.infer(_inputFormats, mklDnnModules(i).inputFormats(), input)) + output.toTable(i + 1) = currentOutput + i += 1 + } + output + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + require(modules.length > 0, "empty modules of concat table") + + var i = 0 + while (i < modules.length) { + tensors(i) = modules(i).updateGradInput(input, gradOutput.toTable(i + 1)) + .asInstanceOf[Tensor[Float]] + i += 1 + } + MklDnnOps.streamSubmit(runtime.stream, 1, sumPrimitive, 1, tensorPrimitives, tensors) + gradInput + } + + override def accGradParameters(input: Activity, gradOutput: Activity): Unit = { + var i = 0 + while (i < modules.length) { + modules(i).accGradParameters(input, gradOutput.toTable(i + 1)) + i += 1 + } + } + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + require(mklDnnModules != null, "You should call compile first") + require(inputs.length == 1, "Concat only accept one tensor") + val buffer = new ArrayBuffer[MemoryData]() + mklDnnModules.foreach(m => { + val (realInput, out) = m.initFwdPrimitives(inputs, phase) + require(out.length == 1, "output should be one tensor") + inputs.zip(realInput).map {case(f, t) => reorderManager.register(f, t)} + buffer.append(out(0)) + }) + _outputFormats = buffer.toArray + _inputFormats = inputs + (inputs, _outputFormats) + } + + override private[mkldnn] def initBwdPrimitives(grads: Array[MemoryData], phase: Phase) = { + require(grads.length == mklDnnModules.length, "grad tensor number is not correct") + _gradOutputFormats = new Array[MemoryData](grads.length) + val subGradInputs = new Array[MemoryData](grads.length) + tensorPrimitives = new Array[Long](grads.length + 1) + var shape: Array[Int] = null + for(i <- 0 until grads.length) { + val m = mklDnnModules(i) + val (realGrads, gradInput) = m.initBwdPrimitives(Array(grads(i)), phase) + require(realGrads.length == 1, "real grad length should be 1") + _gradOutputFormats(i) = realGrads(0) + require(gradInput.length == 1, "real grad length should be 1") + subGradInputs(i) = gradInput(0) + tensorPrimitives(i) = gradInput(0).getPrimitive(runtime) + if (shape == null) { + shape = gradInput(0).shape.clone() + } else { + require(shape.length == gradInput(0).shape.length, "backward grad shape should be same") + for(j <- 0 until shape.length) { + require(shape(j) == gradInput(0).shape(j), "backward grad shape size should be same") + } + } + } + val outputMD = MklDnn.MemoryDescInit(shape.length, shape, DataType.F32, Memory.Format.any) + val scales = grads.map(_ => 1f) + val pd = MklDnn.SumPrimitiveDescCreate(outputMD, grads.length, scales, + subGradInputs.map(_.getPrimitiveDescription(runtime))) + _gradInputFormats = 
Array(MemoryData.primitiveOutput(pd)) + tensorPrimitives(grads.length) = _gradInputFormats(0).getPrimitive(runtime) + sumPrimitive = Array(MklDnn.PrimitiveCreate2(pd, + subGradInputs.map(_.getPrimitive(runtime)), new Array[Int](grads.length), + grads.length, _gradInputFormats.map(_.getPrimitive(runtime)), 1)) + gradInput = initTensor(_gradInputFormats(0)) + tensors = new Array[Tensor[Float]](grads.length + 1) + tensors(grads.length) = gradInput.asInstanceOf[Tensor[Float]] + (_gradOutputFormats, _gradInputFormats) + } + + override private[mkldnn] def initGradWPrimitives(grads: Array[MemoryData], phase: Phase) = { + val realGradsBuffer = new ArrayBuffer[MemoryData]() + for(i <- 0 until grads.length) { + val m = mklDnnModules(i) + val realGradOutput = m.initGradWPrimitives(Array(grads(i)), phase) + require(realGradOutput.length == 1, s"real grad length should be 1, " + + s"but it's ${realGradOutput.length}") + realGradsBuffer.append(realGradOutput(0)) + } + _gradOutputWeightFormats = realGradsBuffer.toArray + _gradOutputWeightFormats + } + + override private[mkldnn] def inputFormats() = { + require(_inputFormats != null, "You should call initFwdPrimitives first") + _inputFormats + } + + override private[mkldnn] def gradInputFormats() = { + require(_gradInputFormats != null, "You should call initBwdPrimitives first") + _gradInputFormats + } + + override private[mkldnn] def outputFormats() = { + require(_outputFormats != null, "You should call initFwdPrimitives first") + _outputFormats + } + + override private[mkldnn] def gradOutputFormats() = { + require(_gradOutputFormats != null, "You should call initBwdPrimitives first") + _gradOutputFormats + } + + private var _inputFormats: Array[MemoryData] = _ + private var _gradInputFormats: Array[MemoryData] = _ + private var _outputFormats: Array[MemoryData] = _ + private var _gradOutputFormats: Array[MemoryData] = _ + private var _gradOutputWeightFormats: Array[MemoryData] = _ + + override private[mkldnn] def gradOutputWeightFormats() = _gradOutputWeightFormats + + override def toString(): String = { + val tab = "\t" + val line = "\n" + val next = " |`-> " + val lastNext = " `-> " + val ext = " | " + val extlast = " " + val last = " ... -> " + var str = s"${getPrintName}" + str = str + " {" + line + tab + "input" + var i = 1 + while (i <= modules.length) { + if (i == modules.length) { + str = str + line + tab + lastNext + "(" + i + "): " + + modules(i-1).toString.replace(line, line + tab + extlast) + } else { + str = str + line + tab + next + "(" + i + "): " + + modules(i-1).toString.replace(line, line + tab + ext) + } + i += 1 + } + str = str + line + tab + last + "output" + str = str + line + "}" + str + } +} + +object ConcatTable { + def apply(): ConcatTable = new ConcatTable() +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/DnnBase.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/DnnBase.scala new file mode 100644 index 00000000000..ff6103d7249 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/DnnBase.scala @@ -0,0 +1,312 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{DataType, MklDnn} +import com.intel.analytics.bigdl.nn.DynamicContainer +import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity} +import com.intel.analytics.bigdl.tensor.{DenseType, DnnTensor, Tensor} +import com.intel.analytics.bigdl.utils.T + +import scala.collection.mutable.ArrayBuffer + +/** + * Helper utilities when integrating Module with MKL-DNN + */ +trait MklDnnModule extends MklDnnModuleHelper { + /** + * MklDnn runtime, which includes a MKL-DNN engine and a MKL-DNN stream. + * Note that this instance will be erased when send to remote worker, so you + * should recreate a MklDnnRuntime. + */ + @transient + protected var runtime : MklDnnRuntime = _ + + def setRuntime(runtime: MklDnnRuntime): Unit = { + this.runtime = runtime + } + + /** + * Init the MKL-DNN primitives for the layer. Note that these primitives will be erased when + * sent to a remote worker. + */ + private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) + : (Array[MemoryData], Array[MemoryData]) + private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) + : (Array[MemoryData], Array[MemoryData]) + private[mkldnn] def initGradWPrimitives(grad: Array[MemoryData], phase: Phase): Array[MemoryData] + = grad + + private[mkldnn] def inputFormats(): Array[MemoryData] + private[mkldnn] def gradInputFormats(): Array[MemoryData] + private[mkldnn] def outputFormats(): Array[MemoryData] + private[mkldnn] def gradOutputFormats(): Array[MemoryData] + private[mkldnn] def gradOutputWeightFormats(): Array[MemoryData] +} + +trait MklDnnModuleHelper { + protected def initActivity(formats: Array[MemoryData]): Activity = { + if (formats.length == 1) { + initTensor(formats(0)) + } else { + T.array(formats.map(initTensor(_))) + } + } + + protected def initTensor(format: MemoryData): Tensor[Float] = { + format match { + case d: NativeData => + DnnTensor[Float](d.shape) + case d: HeapData => + Tensor[Float](d.shape) + case _ => throw new UnsupportedOperationException("memory format is not supported") + } + } + protected def singleNativeData(formats: Array[MemoryData]): Array[MemoryData] = { + require(formats.length == 1, "Only accept one tensor as input") + nativeData(formats) + } + protected def nativeData(formats: Array[MemoryData]): Array[MemoryData] = { + formats.map( + f => { + f match { + case i: NativeData => i + case i: HeapData => i.toNative() + case _ => throw new UnsupportedOperationException("Not support memory format") + } + } + ) + } +} + +trait MklDnnLayer extends AbstractModule[Activity, Activity, Float] with MklDnnModule { + /** + * MKL-DNN primitives of the module. Note you should only initialize this field by calling + * initPrimitives method. This field will be erased when sending model to remote worker. So you + * need to reinitialize it after sending the model. 
+ */ + @transient + protected var updateOutputPrimitives: Array[Long] = _ + @transient + protected var updateGradInputPrimitives: Array[Long] = _ + @transient + protected var accGradientPrimitives: Array[Long] = _ + + protected var _inputFormats: Array[MemoryData] = _ + protected var _gradInputFormats: Array[MemoryData] = _ + protected var _outputFormats: Array[MemoryData] = _ + protected var _gradOutputFormats: Array[MemoryData] = _ + protected var _gradOutputFormatsForWeight: Array[MemoryData] = _ + + @transient + private var updateOutputMemoryPrimitives: Array[Long] = _ + @transient + private var updateOutputTensors: Array[Tensor[Float]] = _ + @transient + private var updateGradInputMemoryPrimitives: Array[Long] = _ + @transient + private var updateGradInputTensors: Array[Tensor[Float]] = _ + @transient + private var cachedInput: Activity = _ + @transient + private var cachedGradOutput: Activity = _ + + override private[mkldnn] def initGradWPrimitives(grad: Array[MemoryData], + phase: Phase): Array[MemoryData] = { + _gradOutputFormatsForWeight = grad + grad + } + + override def updateOutput(input: Activity): Activity = { + if (updateOutputMemoryPrimitives == null) { + updateOutputMemoryPrimitives = + inputFormats().map(_.getPrimitive(runtime)) ++ outputFormats().map(_.getPrimitive(runtime)) + } + if (updateOutputTensors == null || cachedInput == null || !cachedInput.eq(input)) { + val buffer = new ArrayBuffer[Tensor[Float]]() + if (input.isTensor) { + buffer.append(input.asInstanceOf[Tensor[Float]]) + } else { + val table = input.toTable + var i = 1 + while (i <= table.length()) { + buffer.append(table(i)) + i += 1 + } + } + if (output.isTensor) { + buffer.append(output.asInstanceOf[Tensor[Float]]) + } else { + val table = output.toTable + var i = 1 + while (i <= table.length()) { + buffer.append(table(i)) + i += 1 + } + } + updateOutputTensors = buffer.toArray + cachedInput = input + } + MklDnnOps.streamSubmit( + runtime.stream, 1, updateOutputPrimitives, updateOutputPrimitives.length, + updateOutputMemoryPrimitives, + updateOutputTensors + ) + output + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + if (updateGradInputMemoryPrimitives == null) { + updateGradInputMemoryPrimitives = + gradOutputFormats().map(_.getPrimitive(runtime)) ++ + gradInputFormats().map(_.getPrimitive(runtime)) + } + if (updateGradInputTensors == null || cachedInput == null || !cachedInput.eq(input) || + cachedGradOutput == null || !cachedGradOutput.eq(gradOutput)) { + val buffer = new ArrayBuffer[Tensor[Float]]() + if (gradOutput.isTensor) { + buffer.append(gradOutput.asInstanceOf[Tensor[Float]]) + } else { + val table = gradOutput.toTable + var i = 1 + while (i <= table.length()) { + buffer.append(table(i)) + i += 1 + } + } + if (gradInput.isTensor) { + buffer.append(gradInput.asInstanceOf[Tensor[Float]]) + } else { + val table = gradInput.toTable + var i = 1 + while (i <= table.length()) { + buffer.append(table(i)) + i += 1 + } + } + updateGradInputTensors = buffer.toArray + cachedInput = input + cachedGradOutput = gradOutput + } + MklDnnOps.streamSubmit(runtime.stream, 1, updateGradInputPrimitives, + updateGradInputPrimitives.length, + updateGradInputMemoryPrimitives, updateGradInputTensors) + gradInput + } + + + override private[mkldnn] def inputFormats() = { + require(_inputFormats != null, "You should call initFwdPrimitives first") + _inputFormats + } + + override private[mkldnn] def gradInputFormats() = { + require(_gradInputFormats != null, "You should call 
initBwdPrimitives first") + _gradInputFormats + } + + override private[mkldnn] def outputFormats() = { + require(_outputFormats != null, "You should call initFwdPrimitives first") + _outputFormats + } + + override private[mkldnn] def gradOutputFormats() = { + require(_gradOutputFormats != null, "You should call initBwdPrimitives first") + _gradOutputFormats + } + + override private[mkldnn] def gradOutputWeightFormats() = { + _gradOutputFormatsForWeight + } + + def updateWithNewTensor(from: Array[Tensor[Float]], index: Int, + value: Activity): Unit = { + from(index).getTensorType match { + case DenseType => from(index) = value.toTensor[Float] + case _ => + } + } + + def parametersWithShape(): (Array[MemoryData], Array[MemoryData]) = { + (null, null) + } +} + +/** + * Helper utilities when integrating containers with MKL-DNN + */ +trait MklDnnContainer extends DynamicContainer[Activity, Activity, Float] with MklDnnModule { + protected val reorderManager = new ReorderManager() + protected var mklDnnModules : Array[MklDnnModule] = _ + + override def add(module: AbstractModule[_ <: Activity, _ <: Activity, Float]): this.type = { + require(mklDnnModules == null, "You should not call add after compilation") + require(module.isInstanceOf[MklDnnModule], "layer should be MklDnnModule") + super.add(module) + } + + /** + * Create MklDnnRuntime and compile the model + * @param phase + */ + final def compile(phase: Phase, formats: Array[MemoryData]): Unit = { + compile(phase, new MklDnnRuntime(), formats) + } + + /** + * Compile the model, which includes infer memory shapes, allocate memory, optimize computing + * path and create MKL-DNN primitives + * @param phase + * @param runtime + */ + final def compile(phase: Phase, runtime: MklDnnRuntime, formats: Array[MemoryData]): Unit = { + freeze() + fusion(phase) + initPrimitives(phase, runtime, formats) + } + + final def initPrimitives(phase: Phase, runtime: MklDnnRuntime, formats: Array[MemoryData]) + : Unit = { + setRuntime(runtime) + val outputFormats = initFwdPrimitives(formats, phase)._2 + if (phase == Phase.TrainingPhase) { + initBwdPrimitives(outputFormats, phase) + initGradWPrimitives(outputFormats, phase) + } + } + + override def setRuntime(runtime: MklDnnRuntime): Unit = { + super.setRuntime(runtime) + reorderManager.setRuntime(runtime) + modules.foreach { case m: MklDnnModule => m.setRuntime(runtime) } + } + + /** + * Modify the computing path by fuse some layers into one to improve the performance + */ + private[mkldnn] def fusion(phase: Phase): Unit = { + modules.filter(_.isInstanceOf[MklDnnContainer]) + .map { case mc: MklDnnContainer => mc.fusion(phase) } + } + + private def freeze(): Unit = { + if (mklDnnModules == null) { + mklDnnModules = modules.map(_.asInstanceOf[MklDnnModule]).toArray + } + modules.filter(_.isInstanceOf[MklDnnContainer]) + .map { case mc: MklDnnContainer => mc.freeze() } + } +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Identity.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Identity.scala new file mode 100644 index 00000000000..559457f0c8a --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Identity.scala @@ -0,0 +1,60 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.nn.abstractnn.{Activity, AbstractModule} +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric + +import scala.reflect.ClassTag + +/** + * Identity just return the input to output. + * It's useful in same parallel container to get an origin input. + */ +class Identity() extends MklDnnLayer { + + override def updateOutput(input: Activity): Activity = { + output = input + output + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + + gradInput = gradOutput + gradInput + } + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + _inputFormats = inputs + _outputFormats = inputs + (inputs, inputs) + } + + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + _gradOutputFormats = grad + _gradOutputFormatsForWeight = grad + _gradInputFormats = grad + (grad, grad) + } +} + +object Identity { + def apply[@specialized(Float, Double) T: ClassTag]() + (implicit ev: TensorNumeric[T]) : Identity = { + new Identity() + } +} + diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Input.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Input.scala new file mode 100644 index 00000000000..aee8667b752 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Input.scala @@ -0,0 +1,47 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.nn.mkldnn +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.tensor.Tensor + +class Input(shape: Array[Int], layout: Int) extends MklDnnLayer { + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + _outputFormats = Array(HeapData(shape, layout)) + _inputFormats = inputs + (inputs, _outputFormats) + } + + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + _gradInputFormats = Array(HeapData(shape, layout)) + _gradOutputFormats = grad + _gradOutputFormatsForWeight = grad + (grad, _gradInputFormats) + } + + override def updateOutput(input: Activity): Activity = { + output = input + output + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + gradInput = gradOutput + gradInput + } +} + +object Input { + def apply(shape: Array[Int], layout: Int): Input = new Input(shape, layout) +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/JoinTable.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/JoinTable.scala new file mode 100644 index 00000000000..63a353298e5 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/JoinTable.scala @@ -0,0 +1,115 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{DataType, Memory, MklDnn, Query} +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.tensor.Tensor + +import scala.collection.mutable.ArrayBuffer + +class JoinTable(val dimension: Int) extends MklDnnLayer { + @transient + private var memoryPrims: Array[Array[Long]] = _ + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + require(inputs.length > 0, s"at least one tensor, but is ${inputs.length}") + _inputFormats = nativeData(inputs) + + val totalShape = inputs(0).shape.clone() + val layout = inputs(0).layout + var i = 1 + while(i < inputs.length) { + val curShape = inputs(i).shape + require(layout == inputs(i).layout, "layout not match") + require(totalShape.length == curShape.length, "tensor dimension not match") + require(inputs(i).isInstanceOf[NativeData], "memory should be native") + var j = 0 + while(j < curShape.length) { + if (j == dimension - 1) { + totalShape(j) += curShape(j) + } else { + require(totalShape(j) == curShape(j), "tensor size not match") + } + j += 1 + } + i += 1 + } + val primDesc = MklDnn.ConcatPrimitiveDescCreate( + MklDnn.MemoryDescInit(totalShape.length, totalShape, DataType.F32, Memory.Format.any), + inputs.length, dimension - 1, _inputFormats.map(_.getPrimitiveDescription(runtime))) + + _outputFormats = Array(MemoryData.primitiveOutput(primDesc)) + updateOutputPrimitives = Array(MklDnn.PrimitiveCreate2(primDesc, + _inputFormats.map(_.getPrimitive(runtime)), + new Array[Int](inputs.length), inputs.length, + _outputFormats.map(_.getPrimitive(runtime)), 1) + ) + output = initTensor(_outputFormats(0)) + (_inputFormats, _outputFormats) + } + + override private[mkldnn] def initBwdPrimitives(grads: Array[MemoryData], phase: Phase) = { + _gradOutputFormats = singleNativeData(grads) + _gradOutputFormatsForWeight = _gradOutputFormats + _gradInputFormats = _inputFormats.map(f => { + NativeData(f.shape, f.layout) + }) + val prims = new ArrayBuffer[Long]() + val buffer = new ArrayBuffer[Array[Long]]() + val offset = new Array[Int](_gradOutputFormats(0).shape.length) + for(i <- 0 until _gradInputFormats.length) { + val viewPD = MklDnn.ViewPrimitiveDescCreate( + _gradOutputFormats(0).getPrimitiveDescription(runtime), _gradInputFormats(i).shape, offset) + val viewFormat = MemoryData.primitiveOutput(viewPD) + val reorderPD = MklDnn.ReorderPrimitiveDescCreate( + viewFormat.getPrimitiveDescription(runtime), + _gradInputFormats(i).getPrimitiveDescription(runtime)) + val reorderPrim = MklDnn.PrimitiveCreate2(reorderPD, + Array(viewFormat.getPrimitive(runtime)), Array(0), 1, + Array(_gradInputFormats(i).getPrimitive(runtime)), 1) + prims.append(reorderPrim) + buffer.append(Array(viewFormat.getPrimitive(runtime), + _gradInputFormats(i).getPrimitive(runtime))) + offset(dimension - 1) += _gradInputFormats(i).shape(dimension - 1) + } + updateGradInputPrimitives = prims.toArray + gradInput = initActivity(_gradInputFormats) + memoryPrims = buffer.toArray + + (_gradOutputFormats, _gradInputFormats) + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + require(gradOutput.isTensor, "gradOutput should be tensor") + require(gradInput.isTable, "gradInput should be table") + val _gradOutput = gradOutput.asInstanceOf[Tensor[Float]] + val _gradInput = gradInput.toTable + val length = _gradInput.length() + require(length == updateGradInputPrimitives.length, "gradOutput number not match") + 
var i = 0 + while(i < length) { + MklDnnOps.streamSubmit(runtime.stream, 1, Array(updateGradInputPrimitives(i)), + 1, memoryPrims(i), Array(_gradOutput, _gradInput[Tensor[Float]](i + 1))) + i += 1 + } + gradInput + } +} + +object JoinTable { + def apply(dimension: Int): JoinTable = new JoinTable(dimension) +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/LRN.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/LRN.scala new file mode 100644 index 00000000000..141cf6d8500 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/LRN.scala @@ -0,0 +1,97 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{AlgKind, MklDnn, PropKind} +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.tensor.Tensor + +class LRN( + size: Int = 5, + alpha: Double = 1.0, + beta: Double = 0.75, + k: Double = 1.0 +) extends MklDnnLayer { + private val UNDEFINED = 0 + + @transient + private var workSpace : Tensor[Float] = _ + @transient + private var workSpaceFormat: MemoryData = _ + @transient + private var fwdPrimDesc: Long = UNDEFINED + @transient + private var fwdMemPrims: Array[Long] = _ + @transient + private var bwdMemPrims: Array[Long] = _ + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + _inputFormats = singleNativeData(inputs) + val description = MklDnn.LRNForwardDescInit( + PropKind.ForwardTraining, AlgKind.LrnAcrossChannels, + _inputFormats(0).getMemoryDescription(), size, alpha.toFloat, beta.toFloat, k.toFloat) + fwdPrimDesc = MklDnn.PrimitiveDescCreate(description, runtime.engine, 0L) + _outputFormats = Array(MemoryData.primitiveOutput(fwdPrimDesc)) + workSpaceFormat = MemoryData.primitiveWorkSpace(fwdPrimDesc) + workSpace = initTensor(workSpaceFormat) + updateOutputPrimitives = Array(MklDnn.PrimitiveCreate2(fwdPrimDesc, + _inputFormats.map(_.getPrimitive(runtime)), Array(0), 1, Array(_outputFormats(0), + workSpaceFormat).map(_.getPrimitive(runtime)), 2)) + output = initTensor(_outputFormats(0)) + fwdMemPrims = Array(_inputFormats(0), _outputFormats(0), workSpaceFormat) + .map(_.getPrimitive(runtime)) + (_inputFormats, _outputFormats) + } + + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + _gradOutputFormats = singleNativeData(grad) + _gradOutputFormatsForWeight = _gradOutputFormats + val description = MklDnn.LRNBackwardDescInit(AlgKind.LrnAcrossChannels, + _inputFormats(0).getMemoryDescription(), + _gradOutputFormats(0).getMemoryDescription(), size, alpha.toFloat, beta.toFloat, k.toFloat) + require(fwdPrimDesc != UNDEFINED, "You should call initFwdPrimitives first") + val primDesc = MklDnn.PrimitiveDescCreate(description, runtime.engine, fwdPrimDesc) + _gradInputFormats = Array(MemoryData.primitiveGradInput(primDesc)) + updateGradInputPrimitives = 
Array(MklDnn.PrimitiveCreate2(primDesc, + Array(_inputFormats(0), _gradOutputFormats(0), workSpaceFormat).map(_.getPrimitive(runtime)), + Array(0, 0, 0), 3, _gradInputFormats.map(_.getPrimitive(runtime)), 1)) + gradInput = initTensor(_gradInputFormats(0)) + bwdMemPrims = Array(_inputFormats(0), _gradOutputFormats(0), workSpaceFormat, + _gradInputFormats(0)).map(_.getPrimitive(runtime)) + (_gradOutputFormats, _gradInputFormats) + } + + override def updateOutput(input: Activity): Activity = { + val buffer = Array(input.asInstanceOf[Tensor[Float]], output.asInstanceOf[Tensor[Float]], + workSpace) + MklDnnOps.streamSubmit(runtime.stream, 1, updateOutputPrimitives, 1, fwdMemPrims, buffer) + output + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + val buffer = Array( + input.asInstanceOf[Tensor[Float]], gradOutput.asInstanceOf[Tensor[Float]], workSpace, + gradInput.asInstanceOf[Tensor[Float]]) + MklDnnOps.streamSubmit(runtime.stream, 1, updateGradInputPrimitives, 1, + bwdMemPrims, buffer) + gradInput + } +} + +object LRN { + def apply(size: Int = 5, alpha: Double = 1.0, beta: Double = 0.75, k: Double = 1.0): LRN = + new LRN(size, alpha, beta, k) +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Linear.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Linear.scala new file mode 100644 index 00000000000..7bb9bc3f480 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Linear.scala @@ -0,0 +1,300 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package com.intel.analytics.bigdl.nn.mkldnn
+
+import com.intel.analytics.bigdl.mkl.{DataType, Memory, MklDnn, PropKind, Query, Stream => DnnStream}
+import com.intel.analytics.bigdl.nn.abstractnn.{Activity, Initializable, TensorModule}
+import com.intel.analytics.bigdl.nn.{InitializationMethod, RandomUniform, VariableFormat}
+import com.intel.analytics.bigdl.optim.Regularizer
+import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
+import com.intel.analytics.bigdl.tensor._
+import com.intel.analytics.bigdl.utils.{T, Table}
+
+import scala.collection.mutable.ArrayBuffer
+import scala.reflect.ClassTag
+
+class Linear(
+  val inputSize: Int,
+  val outputSize: Int,
+  private val initWeight: Tensor[Float] = null,
+  private val initBias: Tensor[Float] = null,
+  private val initGradWeight: Tensor[Float] = null,
+  private val initGradBias: Tensor[Float] = null) extends MklDnnLayer with Initializable {
+
+  val weight: DnnTensor[Float] = DnnTensor[Float](Array(outputSize, inputSize))
+  val bias: DnnTensor[Float] = DnnTensor[Float](Array(outputSize))
+  val gradWeight: DnnTensor[Float] = DnnTensor[Float](Array(outputSize, inputSize))
+  val gradBias: DnnTensor[Float] = DnnTensor[Float](Array(outputSize))
+
+  var forwardPrimDesc: Long = 0L
+
+  var updateOutputMemoryPrimitives: Array[Long] = _
+  var updateOutputTensors: Array[Tensor[Float]] = _
+  var updateGradInputMemoryPrimitives: Array[Long] = _
+  var updateGradInputTensors: Array[Tensor[Float]] = _
+  var updateGradWMemoryPrimitives: Array[Long] = _
+  var updateGradWTensors: Array[Tensor[Float]] = _
+
+  object ParamsShape {
+    var weight: MemoryData = _
+    var bias: MemoryData = _
+    var gradWeight: MemoryData = _
+    var gradBias: MemoryData = _
+  }
+
+  {
+    val stdv = 1.0 / math.sqrt(weight.size(2))
+    val wInit: InitializationMethod = RandomUniform(-stdv, stdv)
+    val bInit: InitializationMethod = RandomUniform(-stdv, stdv)
+    setInitMethod(wInit, bInit)
+  }
+
+  override def reset(): Unit = {
+    if (initWeight == null) {
+      val t = Tensor[Float](Array(outputSize, inputSize))
+      weightInitMethod.init(t, VariableFormat.OUT_IN)
+      weight.copy(t)
+    } else {
+      weight.copy(initWeight)
+    }
+
+    if (initBias == null) {
+      val t = Tensor[Float](Array(outputSize))
+      biasInitMethod.init(t, VariableFormat.ONE_D)
+      bias.copy(t)
+    } else {
+      bias.copy(initBias)
+    }
+
+    zeroGradParameters()
+  }
+
+  override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = {
+    val weightShape = inputs(0).shape.length match {
+      case 4 => Array(weight.size(1)) ++ inputs(0).shape.slice(1, 4)
+      case _ => weight.size()
+    }
+
+    val inputShape = inputs(0).shape
+    require(inputs(0).shape.length > 1, s"mkldnn linear unsupported input dimension")
+
+    val outputShape = Array(inputs(0).shape(0), outputSize)
+
+    MklDnn.MemoryDescInit(inputShape.length, inputShape,
+      DataType.F32, Memory.Format.any)
+
+    val src = NativeData(inputShape, Memory.Format.any)
+    val wei = NativeData(weightShape, Memory.Format.any)
+    val bis = NativeData(bias.size(), Memory.Format.x)
+    val dst = NativeData(outputShape, Memory.Format.any)
+
+    val desc = MklDnn.LinearForwardDescInit(
+      PropKind.Forward,
+      src.getMemoryDescription(),
+      wei.getMemoryDescription(),
+      bis.getMemoryDescription(),
+      dst.getMemoryDescription())
+    forwardPrimDesc = MklDnn.PrimitiveDescCreate(desc, runtime.engine, 0)
+
+    val List(realSrc, realWei, realDst) = List(Query.SrcPd, Query.WeightsPd, Query.DstPd).map {x =>
+      MemoryData.operationWant(forwardPrimDesc, x)
+    }
+
+    ParamsShape.weight = realWei
+    
ParamsShape.bias = bis + + val srcs = Array(realSrc.getPrimitive(runtime), realWei.getPrimitive(runtime), + bis.getPrimitive(runtime)) + val indexes = Array.fill(srcs.length)(0) + val dsts = Array(realDst.getPrimitive(runtime)) + + val primitive = MklDnn.PrimitiveCreate2(forwardPrimDesc, srcs, indexes, srcs.length, + dsts, dsts.length) + + updateOutputMemoryPrimitives = srcs ++ dsts + updateOutputPrimitives = Array(primitive) + output = initTensor(dst) + + _inputFormats = Array(realSrc) + _outputFormats = Array(realDst) + (_inputFormats, _outputFormats) + } + + override def updateOutput(input: Activity): Activity = { + if (updateOutputTensors == null) { + val buffer = new ArrayBuffer[Tensor[Float]]() + buffer.append(input.asInstanceOf[Tensor[Float]]) + buffer.append(weight) + buffer.append(bias) + buffer.append(output.asInstanceOf[Tensor[Float]]) + updateOutputTensors = buffer.toArray + } + + updateWithNewTensor(updateOutputTensors, 0, input) + + MklDnnOps.streamSubmit(runtime.stream, 1, updateOutputPrimitives, updateOutputPrimitives.length, + updateOutputMemoryPrimitives, updateOutputTensors) + + output + } + + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + val weightShape = inputFormats()(0).shape.length match { + case 4 => Array(weight.size(1)) ++ inputFormats()(0).shape.slice(1, 4) + case _ => weight.size() + } + + val inputShape = inputFormats()(0).shape + + val outputShape = Array(inputFormats()(0).shape(0), outputSize) + + val src = NativeData(inputShape, Memory.Format.any) + val wei = NativeData(weightShape, Memory.Format.any) + val bis = NativeData(bias.size(), Memory.Format.x) + val dst = NativeData(outputShape, Memory.Format.any) + + val desc = MklDnn.LinearBackwardDataDescInit( + src.getMemoryDescription(), + wei.getMemoryDescription(), + grad(0).getMemoryDescription()) + val backwardPrimDesc = MklDnn.PrimitiveDescCreate(desc, runtime.engine, forwardPrimDesc) + + val List(realDiffSrc, realWei, realDiffDst) = + List(Query.DiffSrcPd, Query.WeightsPd, Query.DiffDstPd).map { x => + MemoryData.operationWant(backwardPrimDesc, x) + } + + val srcs = Array(realDiffDst.getPrimitive(runtime), realWei.getPrimitive(runtime)) + val indexes = Array.fill(srcs.length)(0) + val dsts = Array(realDiffSrc.getPrimitive(runtime)) + + val primitive = MklDnn.PrimitiveCreate2(backwardPrimDesc, srcs, indexes, srcs.length, + dsts, dsts.length) + + updateGradInputMemoryPrimitives = srcs ++ dsts + updateGradInputPrimitives = Array(primitive) + gradInput = initTensor(realDiffSrc) + + _gradInputFormats = Array(realDiffSrc) + _gradOutputFormats = Array(realDiffDst) + (_gradOutputFormats, _gradInputFormats) + } + + override private[mkldnn] def initGradWPrimitives(grad: Array[MemoryData], + phase: Phase): Array[MemoryData] = { + val weightShape = inputFormats()(0).shape.length match { + case 4 => Array(weight.size(1)) ++ inputFormats()(0).shape.slice(1, 4) + case _ => weight.size() + } + + val inputShape = inputFormats()(0).shape + + val outputShape = Array(inputFormats()(0).shape(0), outputSize) + + + val src = NativeData(inputShape, Memory.Format.any) + val wei = NativeData(weightShape, Memory.Format.any) + val bis = NativeData(bias.size(), Memory.Format.x) + val dst = NativeData(outputShape, Memory.Format.any) + + val desc = MklDnn.LinearBackwardWeightsDescInit( + src.getMemoryDescription(), wei.getMemoryDescription(), bis.getMemoryDescription(), + dst.getMemoryDescription()) + val gradWeightPrimDesc = MklDnn.PrimitiveDescCreate(desc, runtime.engine, 
forwardPrimDesc)
+
+    val List(realWei, realDiffDst) = List(Query.DiffWeightsPd, Query.DiffDstPd).map { x =>
+      MemoryData.operationWant(gradWeightPrimDesc, x)
+    }
+
+    ParamsShape.gradWeight = realWei
+    ParamsShape.gradBias = bis
+
+    val srcs = Array(inputFormats()(0).getPrimitive(runtime), realDiffDst.getPrimitive(runtime))
+    val indexes = Array.fill(srcs.length)(0)
+    val dsts = Array(realWei.getPrimitive(runtime), bis.getPrimitive(runtime))
+
+    val primitive = MklDnn.PrimitiveCreate2(gradWeightPrimDesc, srcs, indexes, srcs.length,
+      dsts, dsts.length)
+
+    updateGradWMemoryPrimitives = srcs ++ dsts
+    accGradientPrimitives = Array(primitive)
+
+    _gradOutputFormatsForWeight = Array(realDiffDst)
+    (_gradOutputFormatsForWeight)
+  }
+
+  override def updateGradInput(input: Activity, gradOutput: Activity): Activity = {
+    if (updateGradInputTensors == null) {
+      val buffer = new ArrayBuffer[Tensor[Float]]()
+      buffer.append(gradOutput.asInstanceOf[Tensor[Float]])
+      buffer.append(weight)
+      buffer.append(gradInput.asInstanceOf[Tensor[Float]])
+      updateGradInputTensors = buffer.toArray
+    }
+
+    updateWithNewTensor(updateGradInputTensors, 0, gradOutput)
+
+    MklDnnOps.streamSubmit(runtime.stream, 1, updateGradInputPrimitives,
+      updateGradInputPrimitives.length, updateGradInputMemoryPrimitives, updateGradInputTensors)
+
+    gradInput
+  }
+
+  override def accGradParameters(input: Activity, gradOutput: Activity): Unit = {
+    if (updateGradWTensors == null) {
+      val buffer = new ArrayBuffer[Tensor[Float]]()
+      buffer.append(input.asInstanceOf[Tensor[Float]])
+      buffer.append(gradOutput.asInstanceOf[Tensor[Float]])
+      buffer.append(gradWeight)
+      buffer.append(gradBias)
+      updateGradWTensors = buffer.toArray
+    }
+
+    updateWithNewTensor(updateGradWTensors, 0, input)
+    updateWithNewTensor(updateGradWTensors, 1, gradOutput)
+
+    MklDnnOps.streamSubmit(runtime.stream, 1, accGradientPrimitives,
+      accGradientPrimitives.length, updateGradWMemoryPrimitives, updateGradWTensors)
+  }
+
+  override def parameters(): (Array[Tensor[Float]], Array[Tensor[Float]]) = {
+    (Array(weight, bias), Array(gradWeight, gradBias))
+  }
+
+  override def parametersWithShape(): (Array[MemoryData], Array[MemoryData]) = {
+    (Array(ParamsShape.weight, ParamsShape.bias), Array(ParamsShape.gradWeight,
+      ParamsShape.gradBias))
+  }
+
+  override def zeroGradParameters(): Unit = {
+    gradWeight.zero()
+    gradBias.zero()
+  }
+}
+
+object Linear {
+  def apply(
+    inputSize: Int,
+    outputSize: Int,
+    withBias: Boolean = true,
+    initWeight: Tensor[Float] = null,
+    initBias: Tensor[Float] = null,
+    initGradWeight: Tensor[Float] = null,
+    initGradBias: Tensor[Float] = null): Linear = {
+    new Linear(inputSize, outputSize, initWeight, initBias, initGradWeight, initGradBias)
+  }
+}
diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MaxPooling.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MaxPooling.scala
new file mode 100644
index 00000000000..9436af72f90
--- /dev/null
+++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MaxPooling.scala
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2016 The BigDL Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl._ +import com.intel.analytics.bigdl.nn.Utils +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.tensor.Tensor + +class MaxPooling( + kW: Int, + kH: Int, + dW: Int = 1, + dH: Int = 1, + padW: Int = 0, + padH: Int = 0 +) extends MklDnnLayer { + @transient + private var workSpaceFormat: MemoryData = _ + @transient + private var workSpace: Tensor[Float] = _ + @transient + private var fwdMemPrims: Array[Long] = _ + @transient + private var bwdMemPrims: Array[Long] = _ + @transient + private var paddingTL: Array[Int] = _ + @transient + private var paddingBR: Array[Int] = _ + @transient + private var fwdPD: Long = _ + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + _inputFormats = singleNativeData(inputs) + val strides = Array(dW, dH) + val kernel = Array(kH, kW) + val n = _inputFormats(0).shape(0) + val c = _inputFormats(0).shape(1) + val h = _inputFormats(0).shape(2) + val w = _inputFormats(0).shape(3) + val (pt, pb, pl, pr, oh, ow) = + Utils.getPaddingAndOutputSize(h, w, dH, dW, kH, kW, padH, padW) + paddingTL = Array(pt, pl) + paddingBR = Array(pb, pr) + Utils.getSAMEOutSizeAndPadding(h, w, dH, dW, kH, kW) + Utils.getOutSizeAndPaddingForDNN(h, w, dH, dW, kH, kW, padH, padW, true) + val outputMD = MklDnn.MemoryDescInit(4, Array(n, c, oh, ow), DataType.F32, Memory.Format.any) + val description = MklDnn.PoolingForwardDescInit( + PropKind.Forward, AlgKind.PoolingMax, + _inputFormats(0).getMemoryDescription(), outputMD, strides, kernel, paddingTL, paddingBR, + MklDnn.PaddingKind.mkldnnPaddingZero) + fwdPD = MklDnn.PrimitiveDescCreate(description, runtime.engine, 0L) + _outputFormats = Array(MemoryData.primitiveOutput(fwdPD)) + output = initTensor(_outputFormats(0)) + workSpaceFormat = MemoryData.primitiveWorkSpace(fwdPD) + workSpace = initTensor(workSpaceFormat) + updateOutputPrimitives = Array(MklDnn.PrimitiveCreate2(fwdPD, + _inputFormats.map(_.getPrimitive(runtime)), Array(0), 1, + Array(_outputFormats(0), workSpaceFormat).map(_.getPrimitive(runtime)), 2)) + fwdMemPrims = Array(_inputFormats(0), _outputFormats(0), workSpaceFormat) + .map(_.getPrimitive(runtime)) + (_inputFormats, _outputFormats) + } + + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + _gradOutputFormats = singleNativeData(grad) + _gradOutputFormatsForWeight = _gradOutputFormats + val strides = Array(dW, dH) + val kernel = Array(kH, kW) + val description = MklDnn.PoolingBackwardDescInit(AlgKind.PoolingMax, + _inputFormats(0).getMemoryDescription(), + _gradOutputFormats(0).getMemoryDescription(), + strides, kernel, paddingTL, paddingBR, MklDnn.PaddingKind.mkldnnPaddingZero) + + val pd = MklDnn.PrimitiveDescCreate(description, runtime.engine, fwdPD) + _gradInputFormats = Array(MemoryData.primitiveGradInput(pd)) + updateGradInputPrimitives = Array(MklDnn.PrimitiveCreate2(pd, + Array(_gradOutputFormats(0), workSpaceFormat).map(_.getPrimitive(runtime)), + Array(0, 0), 2, 
_gradInputFormats.map(_.getPrimitive(runtime)), 1)) + gradInput = initTensor(_gradInputFormats(0)) + bwdMemPrims = Array(_inputFormats(0), _gradOutputFormats(0), workSpaceFormat, + _gradInputFormats(0)).map(_.getPrimitive(runtime)) + (_gradOutputFormats, _gradInputFormats) + } + + override def updateOutput(input: Activity): Activity = { + val buffer = Array(input.asInstanceOf[Tensor[Float]], output.asInstanceOf[Tensor[Float]], + workSpace) + MklDnnOps.streamSubmit(runtime.stream, 1, updateOutputPrimitives, 1, fwdMemPrims, buffer) + output + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + val buffer = Array( + input.asInstanceOf[Tensor[Float]], gradOutput.asInstanceOf[Tensor[Float]], workSpace, + gradInput.asInstanceOf[Tensor[Float]]) + MklDnnOps.streamSubmit(runtime.stream, 1, updateGradInputPrimitives, 1, + bwdMemPrims, buffer) + gradInput + } +} + +object MaxPooling { + def apply( + kW: Int, + kH: Int, + dW: Int = 1, + dH: Int = 1, + padW: Int = 0, + padH: Int = 0 + ): MaxPooling = new MaxPooling(kW, kH, dW, dH, padW, padH) +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MemoryData.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MemoryData.scala new file mode 100644 index 00000000000..bc5297dd327 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MemoryData.scala @@ -0,0 +1,259 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{DataType, Memory, MklDnn, Query} +import com.intel.analytics.bigdl.tensor.{DnnTensor, Tensor} + +sealed trait MemoryData extends Serializable { + def shape: Array[Int] + def layout: Int + def setShape(shape: Array[Int]): Unit + def setLayout(layout: Int): Unit + + def isLayoutFixed(): Boolean = { + layout != Memory.Format.format_undef && layout != Memory.Format.any + } + + def cloneFormat(): MemoryData + + private val UNDEFINED: Long = -1 + + @transient + private var primitive: Long = UNDEFINED + @transient + private var primitiveDesc: Long = UNDEFINED + @transient + private var description: Long = UNDEFINED + + def getMemoryDescription(): Long = { + if (description == UNDEFINED) { + description = MklDnn.MemoryDescInit(shape.length, shape, DataType.F32, layout) + } + description + } + + def getPrimitiveDescription(runtime: MklDnnRuntime): Long = { + if (primitiveDesc == UNDEFINED) { + primitiveDesc = + MklDnn.MemoryPrimitiveDescCreate(getMemoryDescription(), runtime.engine) + } + primitiveDesc + } + + def getPrimitive(runtime: MklDnnRuntime): Long = { + if (primitive == UNDEFINED) { + primitive = + MklDnn.PrimitiveCreate0(getPrimitiveDescription(runtime)) + } + primitive + } + + def setPrimitiveDescription(desc: Long): Unit = { + primitiveDesc = desc + } + + def setMemoryDescription(desc: Long): Unit = { + description = desc + } +} + +case class HeapData(private var _shape: Array[Int], private var _layout: Int) extends MemoryData { + + override def setShape(shape: Array[Int]): Unit = _shape = shape.clone() + + override def setLayout(layout: Int): Unit = _layout = layout + + override def shape: Array[Int] = _shape.clone() + + override def layout: Int = _layout + + override def hashCode(): Int = { + val seed = 37 + var hash = 1 + hash = hash * seed + this.layout + var d = 0 + while (d < this.shape.length) { + hash = hash * seed + this.shape(d) + d += 1 + } + + hash + } + + override def equals(obj: Any): Boolean = { + if (obj == null) { + return false + } + if (!obj.isInstanceOf[HeapData]) { + return false + } + val other = obj.asInstanceOf[HeapData] + if (this.eq(other)) { + return true + } + if (this.layout != other.layout) { + return false + } + if (this.shape == null && other.shape == null) { + return true + } + if (this.shape != null && other.shape != null) { + if (this.shape.length != other.shape.length) return false + var i = 0 + while(i < this.shape.length) { + if (this.shape(i) != other.shape(i)) return false + i += 1 + } + return true + } else { + return false + } + } + + override def toString: String = { + s"HeapData([${shape.mkString("x")}], ${layout})" + } + + override def cloneFormat(): MemoryData = new HeapData(_shape, _layout) + + def toNative(): NativeData = { + NativeData(shape, layout) + } +} + +case class NativeData(private var _shape: Array[Int], private var _layout: Int) extends MemoryData { + override def shape: Array[Int] = _shape.clone() + + override def layout: Int = _layout + + override def setShape(shape: Array[Int]): Unit = _shape = shape.clone() + + override def setLayout(layout: Int): Unit = _layout = layout + + override def hashCode(): Int = { + val seed = 41 + var hash = 1 + hash = hash * seed + this.layout + var d = 0 + while (d < this.shape.length) { + hash = hash * seed + this.shape(d) + d += 1 + } + + hash + } + + override def equals(obj: Any): Boolean = { + if (obj == null) { + return false + } + if (!obj.isInstanceOf[NativeData]) { + return false + } + val other = 
obj.asInstanceOf[NativeData] + if (this.eq(other)) { + return true + } + if (this.layout != other.layout) { + return false + } + if (this.shape == null && other.shape == null) { + return true + } + if (this.shape != null && other.shape != null) { + if (this.shape.length != other.shape.length) return false + var i = 0 + while(i < this.shape.length) { + if (this.shape(i) != other.shape(i)) return false + i += 1 + } + return true + } else { + return false + } + } + + override def toString: String = { + s"NativeData([${shape.mkString("x")}], ${layout})" + } + + override def cloneFormat(): MemoryData = new NativeData(_shape, _layout) +} + +private[mkldnn] object MemoryData { + def noUndef(formats: Array[MemoryData]): Boolean = { + if (formats == null || formats.length == 0) return true + formats.foreach(f => if (f.layout == Memory.Format.format_undef) return false) + return true + } + + def isSizeCompatible(actual: MemoryData, expect: MemoryData): Boolean = { + if (expect == null) return true + if (actual == null) return false + if (actual.shape.length != expect.shape.length) return false + actual.shape.zip(expect.shape).foreach {case (a, e) => if (a != e) return false} + return true + } + + def primitiveOutput(pd: Long): NativeData = { + val outputPD = MklDnn.PrimitiveDescQueryPd(pd, Query.DstPd, 0) + val memoryDesc = MklDnn.PrimitiveDescQueryMemory(outputPD) + val shape = Memory.GetShape(memoryDesc) + val layout = Memory.GetLayout(memoryDesc) + + val memory = NativeData(shape, layout) + memory.setMemoryDescription(memoryDesc) + memory.setPrimitiveDescription(outputPD) + memory + } + + def primitiveGradInput(pd: Long): NativeData = { + val gradInputPD = MklDnn.PrimitiveDescQueryPd(pd, Query.DiffSrcPd, 0) + val memoryDesc = MklDnn.PrimitiveDescQueryMemory(gradInputPD) + val shape = Memory.GetShape(memoryDesc) + val layout = Memory.GetLayout(memoryDesc) + + val memory = NativeData(shape, layout) + memory.setMemoryDescription(memoryDesc) + memory.setPrimitiveDescription(gradInputPD) + memory + } + + def operationWant(primDesc: Long, queryType: Int): NativeData = { + val memoryPrimDesc = MklDnn.PrimitiveDescQueryPd(primDesc, queryType, 0) + val memoryDesc = MklDnn.PrimitiveDescQueryMemory(memoryPrimDesc) + val shape = Memory.GetShape(memoryDesc) + val layout = Memory.GetLayout(memoryDesc) + + val memory = NativeData(shape, layout) + memory.setMemoryDescription(memoryDesc) + memory.setPrimitiveDescription(memoryPrimDesc) + memory + } + + def primitiveWorkSpace(pd: Long): NativeData = { + val workspacePD = MklDnn.PrimitiveDescQueryPd(pd, Query.WorkspacePd, 0) + val memoryDesc = MklDnn.PrimitiveDescQueryMemory(workspacePD) + val shape = Memory.GetShape(memoryDesc) + val layout = Memory.GetLayout(memoryDesc) + + val memory = NativeData(shape, layout) + memory.setMemoryDescription(memoryDesc) + memory.setPrimitiveDescription(workspacePD) + memory + } +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MklDnnOps.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MklDnnOps.scala new file mode 100644 index 00000000000..1984bd255e8 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MklDnnOps.scala @@ -0,0 +1,58 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{Memory, MklDnn, Engine => DnnEngine, Stream => DnnStream} +import com.intel.analytics.bigdl.tensor.{DnnTensor, Tensor} + +private[mkldnn] object MklDnnOps { + def memorySetDataHandle(memory: Long, data: Tensor[Float], offset: Int): Long = { + require(MklDnn.isLoaded, "mkldnn isn't loaded") + MklDnn.MemorySetDataHandle(memory, data.storage().array(), offset) + } + + def memoryReleaseDataHandle(data: Tensor[Float], ptr: Long): Unit = { + require(MklDnn.isLoaded, "mkldnn isn't loaded") + MklDnn.MemoryReleaseDataHandle(data.storage().array(), ptr) + } + + def streamSubmit(loc: Long, block: Int, primitives: Array[Long], length: Int, + memory_primitives: Array[Long], buffers: Array[Tensor[Float]]): Unit = { + require(MklDnn.isLoaded, "mkldnn isn't loaded") + require(memory_primitives.length == buffers.length) + + val handle = new Array[Long](memory_primitives.length) + for (i <- memory_primitives.indices) { + if (memory_primitives(i) != 0L) { + if (buffers(i).isInstanceOf[DnnTensor[_]]) { + Memory.SetDataHandle(memory_primitives(i), + buffers(i).asInstanceOf[DnnTensor[Float]].storageAddress(), 0) + } else { + handle(i) = MklDnnOps.memorySetDataHandle( + memory_primitives(i), buffers(i), buffers(i).storageOffset() - 1) + } + } + } + + DnnStream.Submit(loc, block, primitives) + + for (i <- memory_primitives.indices) { + if (handle(i) != 0L) { + MklDnnOps.memoryReleaseDataHandle(buffers(i), handle(i)) + } + } + } +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MklDnnRuntime.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MklDnnRuntime.scala new file mode 100644 index 00000000000..b4410347931 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/MklDnnRuntime.scala @@ -0,0 +1,24 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{Engine, MklDnn, Stream} + +class MklDnnRuntime { + MklDnn.isLoaded + val engine : Long = Engine.Create(Engine.Kind.Cpu, 0) + val stream : Long = Stream.Create(Stream.Kind.Eager) +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Phase.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Phase.scala new file mode 100644 index 00000000000..ddc8c298684 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Phase.scala @@ -0,0 +1,25 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.nn.mkldnn + +sealed class Phase + +object Phase { + case object TrainingPhase extends Phase + + case object InferencePhase extends Phase + +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ReLU.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ReLU.scala new file mode 100644 index 00000000000..372aaf91ad2 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ReLU.scala @@ -0,0 +1,61 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{AlgKind, MklDnn, PropKind, Query} + +class ReLU(value: Float = 0.0f) extends MklDnnLayer { + private val UNDEFINED: Long = 0 + + @transient + private var fwdPrimDesc: Long = UNDEFINED + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + _inputFormats = singleNativeData(inputs) + val description = MklDnn.EltwiseForwardDescInit( + PropKind.Forward, AlgKind.EltwiseRelu, _inputFormats(0).getMemoryDescription(), value, 0) + fwdPrimDesc = MklDnn.PrimitiveDescCreate(description, runtime.engine, 0L) + _outputFormats = Array(MemoryData.primitiveOutput(fwdPrimDesc)) + updateOutputPrimitives = Array( + MklDnn.PrimitiveCreate2(fwdPrimDesc, + Array(_inputFormats(0).getPrimitive(runtime)), Array(0), _inputFormats.length, + _outputFormats.map(_.getPrimitive(runtime)), _outputFormats.length) + ) + output = initTensor(_outputFormats(0)) + (_inputFormats, _outputFormats) + } + + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + _gradOutputFormats = singleNativeData(grad) + _gradOutputFormatsForWeight = _gradOutputFormats + val description = MklDnn.EltwiseBackwardDescInit(AlgKind.EltwiseRelu, + _gradOutputFormats(0).getMemoryDescription(), _inputFormats(0).getMemoryDescription(), + value, 0) + require(fwdPrimDesc != UNDEFINED, "You should call initFwdPrimitives first") + val primDesc = MklDnn.PrimitiveDescCreate(description, runtime.engine, fwdPrimDesc) + _gradInputFormats = Array(MemoryData.primitiveGradInput(primDesc)) + updateGradInputPrimitives = Array( + MklDnn.PrimitiveCreate2(primDesc, Array(_inputFormats(0), + _gradOutputFormats(0)).map(_.getPrimitive(runtime)), Array(0), 2, + _gradInputFormats.map(_.getPrimitive(runtime)), _gradInputFormats.length)) + gradInput = initTensor(_gradInputFormats(0)) + (_gradOutputFormats, _gradInputFormats) + } +} + +object ReLU { + def apply(value: Float = 0.0f): ReLU = new ReLU(value) +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ReorderManager.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ReorderManager.scala new file mode 100644 index 00000000000..baefb35dadc --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ReorderManager.scala @@ -0,0 +1,114 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.T + +import scala.collection.mutable + +private[mkldnn] class ReorderManager() { + // (MemoryFormatId, TargetFormat) -> Reorder + val reorders = mutable.HashMap[(Int, MemoryData), ReorderMemory]() + // ReorderId -> RefCount + val refCounts = mutable.HashMap[Int, Int]() + val useCounts = mutable.HashMap[Int, Int]() + + private var runtime: MklDnnRuntime = _ + + def register(from: MemoryData, to: MemoryData): Unit = { + require(runtime != null, "Please call setRuntime first") + val mId = System.identityHashCode(from) + if (needReorder(from, to)) { + if (reorders.contains((mId, to))) { + refCounts(System.identityHashCode(reorders((mId, to)))) += 1 + } else { + val reorder = ReorderMemory(to) + reorder.setRuntime(runtime) + reorder.initFwdPrimitives(Array(from), Phase.InferencePhase) + reorders((mId, to)) = reorder + val reorderId = System.identityHashCode(reorder) + refCounts(reorderId) = 1 + useCounts(reorderId) = 0 + } + } + } + + def setRuntime(runtime: MklDnnRuntime): Unit = { + this.runtime = runtime + } + + def infer(from: Array[MemoryData], to: Array[MemoryData], output: Activity) + : Activity = { + if (from.length == 1) { + require(output.isTensor, "output activity should be a tensor") + inferTensor(from(0), to(0), output.asInstanceOf[Tensor[Float]]) + } else { + require(output.toTable.length() == from.length, + "output activity length doesn't match") + val outputTable = T() + var i = 0 + while(i < from.length) { + outputTable(i + 1) = inferTensor(from(i), to(i), output.toTable(i + 1)) + i += 1 + } + output + } + } + + private def inferTensor(from: MemoryData, to : MemoryData, output: Tensor[Float]) + : Tensor[Float] = { + val mId = System.identityHashCode(from) + if (reorders.contains((mId, to))) { + val reorder = reorders((mId, to)) + val reorderId = System.identityHashCode(reorder) + val result = if (useCounts(reorderId) == 0) { + reorder.forward(output).asInstanceOf[Tensor[Float]] + } else { + reorder.output.asInstanceOf[Tensor[Float]] + } + useCounts(reorderId) += 1 + if (useCounts(reorderId) == refCounts(reorderId)) { + useCounts(reorderId) = 0 + } + result + } else { + output + } + } + + private def needReorder(from: MemoryData, to: MemoryData): Boolean = { + from match { + case h: HeapData => + to match { + case hh: HeapData => + require(h.layout == hh.layout, "Heap data layout should be same") + false + case nn: NativeData => true + case _ => throw new UnsupportedOperationException("Not support such memory format") + } + case n: NativeData => + to match { + case hh: HeapData => true + case nn: NativeData => + nn.layout != n.layout + case _ => throw new UnsupportedOperationException("Not support such memory format") + } + case _ => throw new UnsupportedOperationException("Not support such memory format") + } + } +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ReorderMemory.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ReorderMemory.scala new file mode 100644 index 00000000000..2e27c69184d --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ReorderMemory.scala @@ -0,0 +1,95 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{DataType, Memory, MklDnn} +import com.intel.analytics.bigdl.nn.abstractnn.TensorModule +import com.intel.analytics.bigdl.tensor.{DnnTensor, Tensor} + +class ReorderMemory(inputFormat: MemoryData, outputFormat: MemoryData, + gradInputFormat: MemoryData, gradOutputFormat: MemoryData +) extends MklDnnLayer { + + _outputFormats = Array(outputFormat) + _gradInputFormats = Array(gradInputFormat) + + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + _inputFormats = if (inputFormat == null) inputs else Array(inputFormat) + require(_inputFormats.length == 1, "Only accept one tensor as input") + + require(_inputFormats(0).shape.product == outputFormat.shape.product, + "input output memory not match") + val fwdReorderPrimDesc = MklDnn.ReorderPrimitiveDescCreate( + _inputFormats(0).getPrimitiveDescription(runtime), + outputFormat.getPrimitiveDescription(runtime)) + val fwdReorderPrim = MklDnn.PrimitiveCreate2(fwdReorderPrimDesc, + Array(_inputFormats(0).getPrimitive(runtime)), Array(0), 1, + Array(outputFormat.getPrimitive(runtime)), 1) + + updateOutputPrimitives = Array(fwdReorderPrim) + output = initTensor(outputFormat) + (_inputFormats, _outputFormats) + } + + override private[mkldnn] def initBwdPrimitives(grads: Array[MemoryData], phase: Phase) = { + _gradInputFormats = (gradInputFormat, inputFormat) match { + case (null, null) => inputFormats() + case (null, x) => Array(x) + case (x, _) => Array(x) + } + + _gradOutputFormats = if (gradOutputFormat == null) grads else Array(gradOutputFormat) + _gradOutputFormatsForWeight = if (gradOutputFormat == null) grads else Array(gradOutputFormat) + require(_gradOutputFormats.length == 1, "Only accept one tensor as input") + + require(_gradOutputFormats(0).shape.product == _gradInputFormats(0).shape.product, + "input output memory not match") + val bwdReorderPrimDesc = MklDnn.ReorderPrimitiveDescCreate( + _gradOutputFormats(0).getPrimitiveDescription(runtime), + _gradInputFormats(0).getPrimitiveDescription(runtime)) + val bwdReorderPrim = MklDnn.PrimitiveCreate2(bwdReorderPrimDesc, + _gradOutputFormats.map(_.getPrimitive(runtime)), Array(0), 1, + _gradInputFormats.map(_.getPrimitive(runtime)), 1) + + updateGradInputPrimitives = Array(bwdReorderPrim) + gradInput = initTensor(_gradInputFormats(0)) + (_gradOutputFormats, _gradInputFormats) + } + + override def toString(): String = { + if (_inputFormats != null) { + s"nn.mkl.ReorderMemory(${_inputFormats(0)} -> ${outputFormat})" + } else { + s"nn.mkl.ReorderMemory(_ -> ${outputFormat})" + } + } +} + +object ReorderMemory { + def apply(inputFormat: MemoryData, outputFormat: MemoryData, gradInputFormat: MemoryData, + gradOutputFomat: MemoryData): ReorderMemory = { + new ReorderMemory(inputFormat, outputFormat, gradInputFormat, gradOutputFomat) + } + + def apply(outputFormat: MemoryData, gradInputFormat: MemoryData): ReorderMemory = { + new ReorderMemory(null, outputFormat, gradInputFormat, null) + } + + def apply(outputFormat: MemoryData): ReorderMemory = { + new 
ReorderMemory(null, outputFormat, null, null) + } +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ResNet50Perf.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ResNet50Perf.scala new file mode 100644 index 00000000000..7071b2c1cc4 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/ResNet50Perf.scala @@ -0,0 +1,309 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.Module +import com.intel.analytics.bigdl.mkl.{Memory, MklDnn} +import com.intel.analytics.bigdl.nn._ +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.nn.mkldnn.Phase.{InferencePhase, TrainingPhase} +import com.intel.analytics.bigdl.nn.mkldnn.ResNet.DatasetType.ImageNet +import com.intel.analytics.bigdl.numeric.NumericFloat +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric +import com.intel.analytics.bigdl.utils.RandomGenerator._ +import com.intel.analytics.bigdl.utils.{Engine, T, Table} +import org.apache.log4j.Logger +import scopt.OptionParser + +import scala.reflect.ClassTag + +object ResNet50Perf { + + val logger = Logger.getLogger(getClass) + + val parser = new OptionParser[ResNet50PerfParams]("BigDL Local ResNet-50 Performance Test") { + opt[Int]('b', "batchSize") + .text("Batch size of input data") + .action((v, p) => p.copy(batchSize = v)) + opt[Int]('i', "iteration") + .text("Iteration of perf test. 
The result will be average of each iteration time cost") + .action((v, p) => p.copy(iteration = v)) + opt[Boolean]('t', "training") + .text(s"Perf test training or testing") + .action((v, p) => p.copy(training = v)) + } + + def main(argv: Array[String]): Unit = { + System.setProperty("bigdl.mkldnn.fusion.convbn", "true") + System.setProperty("bigdl.mkldnn.fusion.bnrelu", "true") + System.setProperty("bigdl.mkldnn.fusion.convrelu", "true") + System.setProperty("bigdl.mkldnn.fusion.convsum", "true") + + val coreNumber: Int = Runtime.getRuntime.availableProcessors() / 2 + System.setProperty("bigdl.mklNumThreads", s"$coreNumber") + Engine.setCoreNumber(1) + MklDnn.setNumThreads(coreNumber) + + parser.parse(argv, new ResNet50PerfParams()).foreach { params => + val batchSize = params.batchSize + val training = params.training + val iterations = params.iteration + + val classNum = 1000 + + val inputFormat = Memory.Format.nchw + val inputShape = Array(batchSize, 3, 224, 224) + val input = Tensor(inputShape).rand() + val label = Tensor(batchSize).apply1(_ => Math.floor(RNG.uniform(0, 1) * 1000).toFloat) + + val model = ResNet(batchSize, classNum, T("depth" -> 50, "dataSet" -> ImageNet)) + val criterion = CrossEntropyCriterion() + + if (training) { + model.compile(TrainingPhase, Array(HeapData(inputShape, inputFormat))) + model.training() + } else { + model.compile(InferencePhase, Array(HeapData(inputShape, inputFormat))) + model.evaluate() + } + + var iteration = 0 + while (iteration < iterations) { + val start = System.nanoTime() + val output = model.forward(input) + + if (training) { + val _loss = criterion.forward(output, label) + val errors = criterion.backward(output, label).toTensor + model.backward(input, errors) + } + + val takes = System.nanoTime() - start + + val throughput = "%.2f".format(batchSize.toFloat / (takes / 1e9)) + logger.info(s"Iteration $iteration, takes $takes s, throughput is $throughput imgs/sec") + + iteration += 1 + } + } + } +} + +case class ResNet50PerfParams ( + batchSize: Int = 16, + iteration: Int = 50, + training: Boolean = true +) + +object ResNet { + def modelInit(model: Module[Float]): Unit = { + def initModules(model: Module[Float]): Unit = { + model match { + case container: Container[Activity, Activity, Float] + => container.modules.foreach(m => initModules(m)) + case conv: SpatialConvolution => + val n: Float = conv.kernelW * conv.kernelW * conv.nOutputPlane + val weight = Tensor[Float].resize(conv.weight.size()).apply1 { _ => + RNG.normal(0, Math.sqrt(2.0f / n)).toFloat + } + val bias = Tensor[Float].resize(conv.bias.size()).apply1(_ => 0.0f) + conv.weight.copy(weight) + conv.bias.copy(bias) + case bn: SpatialBatchNormalization => + val runningMean = Tensor[Float].resize(bn.runningMean.size()).fill(0) + val runningVairance = Tensor[Float].resize(bn.runningVariance.size()).fill(1) + bn.runningMean.copy(runningMean) + bn.runningVariance.copy(runningVairance) + case linear: Linear => + val bias = Tensor[Float](linear.bias.size()).apply1(_ => 0.0f) + linear.bias.copy(bias) + case _ => Unit + } + } + initModules(model) + } + + var iChannels = 0 + def apply(batchSize: Int, classNum: Int, opt: Table): Sequential = { + + val depth = opt.get("depth").getOrElse(18) + val shortCutType = opt.get("shortcutType") + val shortcutType = shortCutType.getOrElse(ShortcutType.B).asInstanceOf[ShortcutType] + val dataSet = opt.getOrElse[DatasetType]("dataSet", DatasetType.CIFAR10) + val optnet = opt.get("optnet").getOrElse(true) + + def shortcut(nInputPlane: Int, nOutputPlane: 
Int, stride: Int, name: String): Module[Float] = { + val useConv = shortcutType == ShortcutType.C || + (shortcutType == ShortcutType.B && nInputPlane != nOutputPlane) + + if (useConv) { + Sequential() + .add(Convolution(nInputPlane, nOutputPlane, 1, 1, stride, stride, optnet = optnet) + .setName(s"res${name}_branch1")) + .add(SbnDnn(nOutputPlane).setName(s"bn${name}_branch1")) + } else if (nInputPlane != nOutputPlane) { + throw new IllegalArgumentException(s"useConv false") + } else { + Identity() + } + } + + def bottleneck(n: Int, stride: Int, name: String = ""): Module[Float] = { + val nInputPlane = iChannels + iChannels = n * 4 + + val s = Sequential() + s.add(Convolution(nInputPlane, n, 1, 1, 1, 1, 0, 0, optnet = optnet).setName( + s"res${name}_branch2a")) + .add(SbnDnn(n).setName(s"bn${name}_branch2a")) + .add(ReLU().setName(s"res${name}_branch2a_relu")) + .add(Convolution(n, n, 3, 3, stride, stride, 1, 1, optnet = optnet).setName( + s"res${name}_branch2b")) + .add(SbnDnn(n).setName(s"bn${name}_branch2b")) + .add(ReLU().setName(s"res${name}_branch2b_relu")) + .add(Convolution(n, n*4, 1, 1, 1, 1, 0, 0, optnet = optnet).setName( + s"res${name}_branch2c")) + .add(SbnDnn(n * 4).setInitMethod(Zeros, Zeros).setName(s"bn${name}_branch2c")) + + val model = Sequential() + .add(ConcatTable(). + add(s). + add(shortcut(nInputPlane, n*4, stride, name)).setName(s"$name/concatTable")) + .add(CAddTable().setName(s"res$name")) + .add(ReLU().setName(s"res${name}_relu")) + model + } + + def getName(i: Int, name: String): String = { + val name1 = i match { + case 1 => name + "a" + case 2 => name + "b" + case 3 => name + "c" + case 4 => name + "d" + case 5 => name + "e" + case 6 => name + "f" + } + return name1 + } + + def layer(block: (Int, Int, String) => Module[Float], features: Int, + count: Int, stride: Int = 1, name : String): Module[Float] = { + val s = Sequential() + for (i <- 1 to count) { + s.add(block(features, if (i == 1) stride else 1, getName(i, name))) + } + s + } + + val model = Sequential() + if (dataSet == DatasetType.ImageNet) { + val cfg = Map( + 50 -> ((3, 4, 6, 3), 2048, bottleneck: (Int, Int, String) => Module[Float]) + ) + + require(cfg.keySet.contains(depth), s"Invalid depth ${depth}") + + val (loopConfig, nFeatures, block) = cfg.get(depth).get + iChannels = 64 + + model.add(ReorderMemory(HeapData(Array(batchSize, 3, 224, 224), Memory.Format.nchw))) + .add(SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3, propagateBack = false) + .setName("conv1").setReLU(true)) + .add(SbnDnn(64).setName("bn_conv1")) + .add(ReLU().setName("conv1_relu")) + .add(MaxPooling(3, 3, 2, 2).setName("pool1")) + .add(layer(block, 64, loopConfig._1, name = "2")) + .add(layer(block, 128, loopConfig._2, 2, name = "3")) + .add(layer(block, 256, loopConfig._3, 2, name = "4")) + .add(layer(block, 512, loopConfig._4, 2, name = "5")) + .add(AvgPooling(7, 7, 1, 1).setName("pool5")) + .add(Linear(nFeatures, classNum).setInitMethod(RandomNormal(0.0, 0.01), Zeros).setName( + "fc1000")) + .add(ReorderMemory(HeapData(Array(batchSize, classNum), Memory.Format.nc))) + } else { + throw new IllegalArgumentException(s"Invalid dataset ${dataSet}") + } + + modelInit(model) + model + } + + /** + * dataset type + * @param typeId type id + */ + sealed abstract class DatasetType(typeId: Int) + extends Serializable + + /** + * define some dataset type + */ + object DatasetType { + case object CIFAR10 extends DatasetType(0) + case object ImageNet extends DatasetType(1) + } + + /** + * ShortcutType + * @param typeId type id + */ + sealed 
abstract class ShortcutType(typeId: Int) + extends Serializable + + /** + * ShortcutType-A is used for Cifar-10, ShortcutType-B is used for ImageNet. + * ShortcutType-C is used for others. + */ + object ShortcutType{ + case object A extends ShortcutType(0) + case object B extends ShortcutType(1) + case object C extends ShortcutType(2) + } +} + +object Convolution { + def apply( + nInputPlane: Int, + nOutputPlane: Int, + kernelW: Int, + kernelH: Int, + strideW: Int = 1, + strideH: Int = 1, + padW: Int = 0, + padH: Int = 0, + nGroup: Int = 1, + propagateBack: Boolean = true, + optnet: Boolean = true, + weightDecay: Double = 1e-4): SpatialConvolution = { + val conv = SpatialConvolution(nInputPlane, nOutputPlane, kernelW, kernelH, + strideW, strideH, padW, padH, nGroup, propagateBack) + conv.setInitMethod(MsraFiller(false), Zeros) + conv + } +} + +object SbnDnn { + def apply[@specialized(Float, Double) T: ClassTag]( + nOutput: Int, + eps: Double = 1e-3, + momentum: Double = 0.9, + affine: Boolean = true) + (implicit ev: TensorNumeric[T]): SpatialBatchNormalization = { + SpatialBatchNormalization(nOutput, eps, momentum, affine).setInitMethod(Ones, Zeros) + } +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SelectTable.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SelectTable.scala new file mode 100644 index 00000000000..c25e5b5bc85 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SelectTable.scala @@ -0,0 +1,99 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.nn.Utils +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric +import com.intel.analytics.bigdl.utils.{T, Table} + +import scala.reflect.ClassTag + +/** + * Creates a module that takes a table as input and outputs the element at index `index` + * (positive or negative). This can be either a table or a Tensor. + * The gradients of the non-index elements are zeroed Tensors of the same size. + * This is true regardless of the depth of the encapsulated Tensor as the function used + * internally to do so is recursive. 
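+ *
+ * A rough usage sketch (illustrative, not part of this patch): for an input table T(a, b),
+ * SelectTable(2).forward(input) returns b unchanged, and the backward pass yields a table
+ * whose first entry is a zero tensor shaped like a and whose second entry is gradOutput.
+ *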
+ * @param index the index to be selected + */ +@SerialVersionUID(- 7562114420457472987L) +class SelectTable(val index: Int)(implicit ev: TensorNumeric[Float]) extends MklDnnLayer { + + override def updateOutput(in: Activity): Activity = { + val input = in.asInstanceOf[Table] + val index = if (this.index < 0) input.length() + this.index else this.index + + require(input.contains(index), "index does not exist in the input table") + output = input[Activity](index) + + output + } + + override def updateGradInput(in: Activity, gradOutput: Activity): Table = { + val input = in.asInstanceOf[Table] + gradInput = T() + Utils.zeroTableCopy(gradInput.asInstanceOf[Table], input) + val index = if (this.index < 0) { + input.length() + this.index + 1 + } else { + this.index + } + + Utils.recursiveCopy(gradInput.asInstanceOf[Table](index), gradOutput) + + require(gradInput.asInstanceOf[Table].contains(index), "Index exceeds the size of input table") + + gradInput.asInstanceOf[Table] + } + + override def toString: String = s"mkldnn.SelectTable($index)" + + + override def canEqual(other: Any): Boolean = other.isInstanceOf[SelectTable] + + override def equals(other: Any): Boolean = other match { + case that: SelectTable => + super.equals(that) && + (that canEqual this) && + index == that.index + case _ => false + } + + override def hashCode(): Int = { + val state = Seq(super.hashCode(), index) + state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b) + } + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + _inputFormats = inputs + _outputFormats = Array(inputs(index)) + (inputs, _outputFormats) + } + + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + _gradInputFormats = Array(grad(index)) + _gradOutputFormats = grad + (grad, _gradInputFormats) + } +} + +object SelectTable { + def apply(dimension: Int)(implicit ev: TensorNumeric[Float]) : SelectTable = { + new SelectTable(dimension) + } +} + diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Sequential.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Sequential.scala new file mode 100644 index 00000000000..c8a4172f594 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/Sequential.scala @@ -0,0 +1,391 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.Module +import com.intel.analytics.bigdl.nn.abstractnn.{AbstractModule, Activity} +import com.intel.analytics.bigdl.nn.mkldnn.Phase.{InferencePhase, TrainingPhase} +import com.intel.analytics.bigdl.nn.{Sequential => Seq} +import com.intel.analytics.bigdl.tensor.Tensor + +import scala.collection.mutable.ArrayBuffer + +class Sequential extends MklDnnContainer { + + val fuseConvBn = System.getProperty("bigdl.mkldnn.fusion.convbn", "false").toBoolean + val fuseBnRelu = System.getProperty("bigdl.mkldnn.fusion.bnrelu", "false").toBoolean + val fuseConvRelu = System.getProperty("bigdl.mkldnn.fusion.convrelu", "false").toBoolean + val fuseConvSum = System.getProperty("bigdl.mkldnn.fusion.convsum", "false").toBoolean + + override def add(module: AbstractModule[_ <: Activity, _ <: Activity, Float]): this.type = { + require(mklDnnModules == null, "You should not call add after compilation") + require(module.isInstanceOf[MklDnnModule], "layer should be MklDnnModule") + super.add(module) + } + + override private[mkldnn] def fusion(phase: Phase): Unit = { + modules.clear() + modules.appendAll(getFusedModules(phase).map {x => + x.asInstanceOf[AbstractModule[Activity, Activity, Float]] + }) + mklDnnModules = modules.map(_.asInstanceOf[MklDnnModule]).toArray + } + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + var lastOutputFormats = inputs + var firstRealInputFormats: Array[MemoryData] = null + for (i <- 0 until mklDnnModules.length) { + val m = mklDnnModules(i) + val (realInputFormats, outputFormats) = m.initFwdPrimitives(lastOutputFormats, phase) + lastOutputFormats.zip(realInputFormats).foreach { + case (o, i) => reorderManager.register(o, i) + } + if (i == 0) firstRealInputFormats = realInputFormats + lastOutputFormats = outputFormats + } + (firstRealInputFormats, lastOutputFormats) + } + + override private[mkldnn] def initBwdPrimitives(grads: Array[MemoryData], phase: Phase) = { + var lastGradInputFormats = grads + var firstRealGradOutputFormats: Array[MemoryData] = null + for (i <- mklDnnModules.length - 1 to 0 by -1) { + val m = mklDnnModules(i) + val (realGradOutput, gradInputFomrats) = m.initBwdPrimitives(lastGradInputFormats, phase) + lastGradInputFormats.zip(realGradOutput).foreach { + case (gi, go) => reorderManager.register(gi, go) + } + if (i == mklDnnModules.length - 1) firstRealGradOutputFormats = realGradOutput + lastGradInputFormats = gradInputFomrats + } + (firstRealGradOutputFormats, lastGradInputFormats) + } + + override private[mkldnn] def initGradWPrimitives(grads: Array[MemoryData], phase: Phase) = { + var lastGradInputFormats = grads + var firstRealGradOutputFormats: Array[MemoryData] = null + for (i <- mklDnnModules.length - 1 to 0 by -1) { + val m = mklDnnModules(i) + val realGradOutput = m.initGradWPrimitives(lastGradInputFormats, phase) + lastGradInputFormats.zip(realGradOutput).foreach { + case (gi, go2) => reorderManager.register(gi, go2) + } + if (i == mklDnnModules.length - 1) firstRealGradOutputFormats = realGradOutput + lastGradInputFormats = m.gradInputFormats() + } + firstRealGradOutputFormats + } + + override def updateOutput(input: Activity): Activity = { + var i = 0 + var lastOutput = input + while (i < mklDnnModules.length - 1) { + lastOutput = reorderManager.infer( + mklDnnModules(i).outputFormats(), + mklDnnModules(i + 1).inputFormats(), + modules(i).forward(lastOutput) + ) + i += 1 + } + + this.output = 
modules(i).forward(lastOutput) + output + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + var i = modules.length - 1 + var lastGradInput = gradOutput + while (i > 0) { + val curInput = reorderManager.infer( + mklDnnModules(i - 1).outputFormats(), + mklDnnModules(i).inputFormats(), + modules(i - 1).output + ) + lastGradInput = reorderManager.infer( + mklDnnModules(i).gradInputFormats(), + mklDnnModules(i - 1).gradOutputFormats(), + modules(i).updateGradInput(curInput, lastGradInput) + ) + i -= 1 + } + lastGradInput = modules(0).updateGradInput(input, lastGradInput) + + this.gradInput = lastGradInput + gradInput + } + + override def accGradParameters(input: Activity, gradOutput: Activity): Unit = { + var i = modules.length - 1 + var currentModule = modules(i) + var lastGradInput = gradOutput + while (i > 0) { + currentModule = modules(i) + val curInput = reorderManager.infer( + mklDnnModules(i - 1).outputFormats(), + mklDnnModules(i).inputFormats(), + modules(i - 1).output + ) + currentModule.accGradParameters(curInput, lastGradInput) + lastGradInput = reorderManager.infer( + mklDnnModules(i).gradInputFormats(), + mklDnnModules(i - 1).gradOutputWeightFormats(), + modules(i).gradInput + ) + i -= 1 + } + + modules(i).accGradParameters(input, lastGradInput) + } + + override private[mkldnn] def inputFormats() = { + modules(0).asInstanceOf[MklDnnModule].inputFormats() + } + + override private[mkldnn] def gradInputFormats() = { + modules(0).asInstanceOf[MklDnnModule].gradInputFormats() + } + + override private[mkldnn] def outputFormats() = { + modules.last.asInstanceOf[MklDnnModule].outputFormats() + } + + override private[mkldnn] def gradOutputFormats() = { + modules.last.asInstanceOf[MklDnnModule].gradOutputFormats() + } + + override private[mkldnn] def gradOutputWeightFormats() = { + modules.last.asInstanceOf[MklDnnModule].gradOutputWeightFormats() + } + + type ArrayBufferModules[Float] = ArrayBuffer[AbstractModule[Activity, Activity, Float]] + private def convWithBn(modules: Array[MklDnnModule], phase: Phase): Array[MklDnnModule] = { + if (fuseConvBn && phase == InferencePhase) { + val newModules: ArrayBuffer[MklDnnModule] = ArrayBuffer.empty + var lastBn: SpatialBatchNormalization = null + + modules.zip(modules.drop(1) ++ Array(null)).foreach { case (f, s) => + (f, s) match { + case (conv: SpatialConvolution, bn: SpatialBatchNormalization) => + mergeConvBn(conv, bn) + newModules.append(conv) + lastBn = bn + case (f: MklDnnContainer, s) => f.fusion(phase); newModules.append(f) + case _ => if (lastBn != f) { newModules.append(f) } + } + } + + newModules.toArray + } else { + modules + } + } + + private def convWithReLU(modules: Array[MklDnnModule], phase: Phase): Array[MklDnnModule] = { + if (fuseConvRelu) { + val newModules: ArrayBuffer[MklDnnModule] = ArrayBuffer.empty + var lastReLU: ReLU = null + + modules.zip(modules.drop(1) ++ Array(null)).foreach { case (f, s) => + (f, s) match { + case (conv: SpatialConvolution, relu: ReLU) => + newModules.append(conv) + conv.setReLU() + lastReLU = relu + case (f: MklDnnContainer, s) => + f.fusion(phase) + newModules.append(f) + case _ => if (lastReLU != f) { + newModules.append(f) + } + } + } + + newModules.toArray + } else { + modules + } + } + + private def bnWithReLU(modules: Array[MklDnnModule], phase: Phase): Array[MklDnnModule] = { + if (fuseBnRelu) { + val newModules: ArrayBuffer[MklDnnModule] = ArrayBuffer.empty + var lastReLU: ReLU = null + + modules.zip(modules.drop(1) ++ Array(null)).foreach { case 
(f, s) => + (f, s) match { + case (bn: SpatialBatchNormalization, relu: ReLU) => + newModules.append(bn) + bn.setReLU(true) + lastReLU = relu + case (f: MklDnnContainer, s) => f.fusion(phase); newModules.append(f) + case _ => if (lastReLU != f) { newModules.append(f) } + } + } + + newModules.toArray + } else { + modules + } + } + + private def convWithSum(modules: Array[MklDnnModule], phase: Phase): Array[MklDnnModule] = { + val newModules: ArrayBuffer[MklDnnModule] = ArrayBuffer.empty + if (!fuseConvSum || modules.length <= 2 || phase == TrainingPhase) { + newModules.appendAll(modules) + } else { + var lastConv: SpatialConvolution = null + var lastReLU: ReLU = null + + modules.zip(modules.drop(1) ++ Array(null)).foreach { + case (f: ConcatTable, s: CAddTable) => val (conv, sbt) = convSum(f, s) + newModules.append(f) + lastConv = conv + if (sbt != null) { + newModules.append(sbt) + } + case (f: MklDnnContainer, s) => f.fusion(phase); newModules.append(f) + case (f: CAddTable, s: ReLU) => if (lastConv != null) { + lastConv.setReLU() + lastReLU = s + lastConv = null + } else { + newModules.append(f) + } + case (f, s) => if (lastReLU != f) { newModules.append(f); lastReLU = null} + } + } + + newModules.toArray + } + + private def getFusedModules(phase: Phase): Array[MklDnnModule] = { + val f1Modules = convWithBn(mklDnnModules, phase) + val f2Modules = convWithReLU(f1Modules, phase) + val f3Modules = bnWithReLU(f2Modules, phase) + val f4Modules = convWithSum(f3Modules, phase) + f4Modules + } + + private def mergeConvBn(conv: SpatialConvolution, bn: SpatialBatchNormalization): Unit = { + + val originVar = Tensor[Float].resize(bn.runningVariance.size()).copy(bn.runningVariance) + val originMean = Tensor[Float].resize(bn.runningMean.size()).copy(bn.runningMean) + + val convWeight = Tensor[Float].resize(conv.weight.size()).copy(conv.weight) + val convBias = Tensor[Float].resize(conv.bias.size()).copy(conv.bias) + + (0 until bn.nOutput).foreach { j => + val variance = originVar.storage().array()(j + originVar.storageOffset() - 1) + val base = Math.sqrt(variance.asInstanceOf[Float] + bn.eps).toFloat + require(base != 0.0, s"the eps of ${bn.getName()} should be more than 0") + + val weight = if (conv.nGroup == 1) { + convWeight.select(1, j + 1) + } else { + convWeight.select(2, j + 1) + } + weight.div(base) + + val bias = convBias.storage().array()(j) + val mean = originMean.storage().array()(j) + convBias.storage().array()(j) = (bias - mean) / base + } + + conv.weight.copy(convWeight) + conv.bias.copy(convBias) + } + + private def getLast( + module: AbstractModule[Activity, Activity, Float]): AbstractModule[Activity, Activity, Any] = { + val ret = module match { + case sequential: Sequential => sequential.modules.last + case _ => module + } + + ret.asInstanceOf[AbstractModule[Activity, Activity, Any]] + } + + private def convSum(concatTable: ConcatTable, cAddTable: CAddTable): (SpatialConvolution, + SelectTable) = { + var branch1: AbstractModule[Activity, Activity, Any] = null + var branch2: AbstractModule[Activity, Activity, Any] = null + + var continue = concatTable.modules.length == 2 + + if (continue) { + branch1 = getLast(concatTable.modules(0)) + branch2 = getLast(concatTable.modules(1)) + + def isConvOrIdentity(module: AbstractModule[Activity, Activity, Any]): Boolean = { + module.isInstanceOf[SpatialConvolution] || module.isInstanceOf[Identity] + } + + continue = continue && isConvOrIdentity(branch1) && isConvOrIdentity(branch2) + } + + if (continue) { + // make sure the last module is conv 
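+      // Rough sketch of the fusion performed here (comments only, no behavior change):
+      // a ConcatTable(branch1, branch2) followed by CAddTable is rewritten so that the
+      // convolution ending one branch accumulates the other branch's output in place
+      // (see setSum / setSumOp below), and the CAddTable is replaced by a SelectTable
+      // that simply forwards the already-summed result.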
+ if (!branch2.isInstanceOf[SpatialConvolution]) { + // swap the modules + var tmp: AbstractModule[Activity, Activity, Float] = null + + tmp = concatTable.modules(0) + concatTable.modules(0) = concatTable.modules(1) + concatTable.modules(1) = tmp + + tmp = branch1.asInstanceOf[AbstractModule[Activity, Activity, Float]] + branch1 = branch2 + branch2 = tmp.asInstanceOf[AbstractModule[Activity, Activity, Any]] + } + + // get the index of conv, by default the output should be the first conv. + val (convIndex, conv, theOther) = (1, branch2.asInstanceOf[SpatialConvolution], branch1) + conv.setSum() + + // delete CAddTable + val selectTable = SelectTable(convIndex) + + // change the branch2's output to branch1's output + // FIXME maybe we should not set the conv operation + conv.setSumOp(theOther.asInstanceOf[Module[Float]]) + (conv, selectTable) + } else { + (null, null) + } + } + + override def toString(): String = { + val tab = " " + + s"${getPrintName}{${line + tab}[input -> ${ + modules.zipWithIndex.map { + case (m: AbstractModule[Activity, Activity, Float], i: Int) => "(" + (i + 1) + ")" + }. + mkString(" -> ") + } -> output]${line + tab}" + + s"${ + modules.zipWithIndex.map { + case (model: AbstractModule[Activity, Activity, Float], index: Int) + => s"(${index + 1}): ${model.setLine(line + tab)}" + }. + mkString(line + tab) + }$line}" + } +} + +object Sequential { + def apply(): Sequential = new Sequential() +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SoftMax.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SoftMax.scala new file mode 100644 index 00000000000..9c59a2b1135 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SoftMax.scala @@ -0,0 +1,109 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{MklDnn, PropKind, Stream => DnnStream} +import com.intel.analytics.bigdl.nn +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.nn.mkldnn.Phase.{InferencePhase, TrainingPhase} +import com.intel.analytics.bigdl.tensor.{DenseType, Tensor} + +import scala.collection.mutable.ArrayBuffer + +class SoftMax() extends MklDnnLayer { + val nnSoftMax = nn.SoftMax[Float]() + + var updateOutputTensors: Array[Tensor[Float]] = _ + var updateOutputMemoryPrimitives: Array[Long] = _ + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + phase match { + case TrainingPhase => (inputs, inputs) // do nothing, because mkl dnn doesn't support training + case InferencePhase => + val axis = inputs(0).shape.length match { + case 1 => 0 + case 2 => 1 +// case 3 => 1 // TODO should support this? 
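+          // The softmax axis is the class/channel dimension: dim 0 for a 1D input,
+          // dim 1 for 2D (nc) and 4D (nchw) inputs; other ranks are rejected below.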
+ case 4 => 1 + case _ => throw new UnsupportedOperationException("1D, 2D, or 4D tensor expected") + } + + _inputFormats = singleNativeData(inputs) + val desc = MklDnn.SoftMaxForwardDescInit(PropKind.ForwardInference, + inputFormats()(0).getMemoryDescription(), axis) + val forwardPrimDesc = MklDnn.PrimitiveDescCreate(desc, runtime.engine, 0L) + + _outputFormats = Array(MemoryData.primitiveOutput(forwardPrimDesc)) + + val srcs = Array(inputs(0).getPrimitive(runtime)) + val indexes = Array(0) + val dsts = Array(_outputFormats(0).getPrimitive(runtime)) + + val primitive = MklDnn.PrimitiveCreate2(forwardPrimDesc, srcs, indexes, srcs.length, dsts, + dsts.length) + + updateOutputPrimitives = Array(primitive) + updateOutputMemoryPrimitives = srcs ++ dsts + + output = initTensor(_outputFormats(0)) + + (_inputFormats, _outputFormats) + case _ => throw new UnsupportedOperationException + } + } + + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + (grad, grad) + } + + override def updateOutput(input: Activity): Activity = { + if (this.isTraining()) { + nnSoftMax.forward(input) + output = nnSoftMax.output + } else { + if (updateOutputTensors == null) { + val buffer = new ArrayBuffer[Tensor[Float]]() + buffer.append(input.asInstanceOf[Tensor[Float]]) + buffer.append(output.asInstanceOf[Tensor[Float]]) + updateOutputTensors = buffer.toArray + } + + input.toTensor[Float].getTensorType match { + case DenseType => updateOutputTensors(0) = input.toTensor + case _ => + } + + MklDnnOps.streamSubmit(runtime.stream, 1, + updateOutputPrimitives, + updateOutputPrimitives.length, + updateOutputMemoryPrimitives, updateOutputTensors) + } + + output + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + gradInput = nnSoftMax.backward(input, gradOutput) + gradInput + } +} + +object SoftMax { + def apply(): SoftMax = { + new SoftMax() + } +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SpatialBatchNormalization.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SpatialBatchNormalization.scala new file mode 100644 index 00000000000..10a9d120ef0 --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SpatialBatchNormalization.scala @@ -0,0 +1,307 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{AlgKind, Memory, MklDnn, PropKind, Query} +import com.intel.analytics.bigdl.nn.abstractnn.{Activity, Initializable} +import com.intel.analytics.bigdl.nn.mkldnn.Phase.{InferencePhase, TrainingPhase} +import com.intel.analytics.bigdl.nn.{Ones, VariableFormat, Zeros} +import com.intel.analytics.bigdl.tensor._ + +import scala.collection.mutable.ArrayBuffer + +class SpatialBatchNormalization( + val nOutput: Int, + val eps: Double = 1e-5, + val momentum: Double = 0.1, + val affine: Boolean = true, + private val initWeight: Tensor[Float] = null, + private val initBias: Tensor[Float] = null, + private val initGradWeight: Tensor[Float] = null, + private val initGradBias: Tensor[Float] = null +) extends MklDnnLayer with Initializable { + + private var forwardDesc: Long = 0L + private var _relu: Boolean = false + + def setReLU(value: Boolean): this.type = { + _relu = value + this + } + def relu: Boolean = _relu + + var updateOutputTensors: Array[Tensor[Float]] = _ + var updateOutputMemoryPrimitives: Array[Long] = _ + var updateGradInputTensors: Array[Tensor[Float]] = _ + var updateGradInputMemoryPrimitives: Array[Long] = _ + + @transient var mean: DnnTensor[Float] = DnnTensor[Float](nOutput) + @transient var variance: DnnTensor[Float] = DnnTensor[Float](nOutput) + @transient var runningMean: DnnTensor[Float] = DnnTensor[Float](nOutput) + @transient var runningVariance: DnnTensor[Float] = DnnTensor[Float](nOutput) + @transient var weightAndBias: DnnTensor[Float] = DnnTensor[Float](Array(nOutput * 2)) + @transient var gradWeightAndBias: DnnTensor[Float] = DnnTensor[Float](Array(nOutput * 2)) + + var scaleFactor: Float = 0.0f + var biasFactor: Float = 0.0f + + { + val wInit = Ones // RandomUniform(0, 1) + val bInit = Zeros + setInitMethod(wInit, bInit) + } + + override def reset(): Unit = { + val init = Tensor[Float]().resize(Array(2, nOutput)) + val weight = init.select(1, 1) + val bias = init.select(1, 2) + + if (initWeight != null) { + require(initWeight.size(1) == nOutput) + weight.copy(initWeight) + } else { + weightInitMethod.init(weight, VariableFormat.ONE_D) + } + + if (initBias != null) { + require(initBias.size(1) == nOutput) + bias.copy(initBias) + } else { + biasInitMethod.init(bias, VariableFormat.ONE_D) + } + + weightAndBias.copy(init.view(2 * nOutput)) + + val zeros = Tensor[Float](Array(nOutput)).fill(0) + mean.copy(zeros) + variance.copy(zeros) + } + + object Index { + val input = 0 + val weight = 1 + val output = 2 + val mean = 3 + val variance = 4 + } + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + _inputFormats = inputs + + val m = inputFormats()(0).shape.product / this.nOutput + biasFactor = if (m > 1) { m.toFloat / (m - 1) } else { 1 } + + val List(mean, variance, runningMean, runningVariance): List[NativeData] = + (0 until 4).map { _ => + NativeData(Array(nOutput), Memory.Format.x) + }.toList + // weight and bias should be combined + val weightAndBias: NativeData = NativeData(Array(nOutput * 2), Memory.Format.x) + + forwardDesc = phase match { + case TrainingPhase => + MklDnn.BatchNormForwardDescInit(PropKind.Forward, + inputs(0).getMemoryDescription(), eps.toFloat, MklDnn.BatchNormFlag.mkldnn_use_scaleshift) + case InferencePhase => + // we always use the weight and bias / scale and offset. So the flags should be combined + // with use_scaleshift and use_global_stats. 
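+        // mkldnn_use_global_stats makes the primitive consume the stored running mean
+        // and variance as inputs rather than computing batch statistics, which is what
+        // inference needs; the flags are bit masks, hence the bitwise OR below.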
+ MklDnn.BatchNormForwardDescInit(PropKind.ForwardInference, + inputs(0).getMemoryDescription(), eps.toFloat, + MklDnn.BatchNormFlag.mkldnn_use_global_stats | MklDnn.BatchNormFlag.mkldnn_use_scaleshift) + case _ => throw new UnsupportedOperationException + } + + val primDesc = if (relu) { + val postOps = MklDnn.CreatePostOps() + MklDnn.PostOpsAppendEltwise(postOps, 1.0f, AlgKind.EltwiseRelu, 0.0f, 0.0f) + val attr = MklDnn.CreateAttr() + MklDnn.AttrSetPostOps(attr, postOps) + MklDnn.PrimitiveDescCreateV2(forwardDesc, attr, runtime.engine, 0) + // TODO we should destroy these ops + } else { + MklDnn.PrimitiveDescCreate(forwardDesc, runtime.engine, 0) + } + + _inputFormats = Array(MemoryData.operationWant(primDesc, Query.SrcPd)) + _outputFormats = Array(MemoryData.operationWant(primDesc, Query.DstPd)) + + val (srcs, dsts) = if (phase == TrainingPhase) { + val srcs = Array(inputFormats()(0), weightAndBias).map(_.getPrimitive(runtime)) + val dsts = Array(outputFormats()(0), mean, variance).map(_.getPrimitive(runtime)) + (srcs, dsts) + } else { + val srcs = Array(inputFormats()(0), runningMean, runningVariance, weightAndBias).map { x => + x.getPrimitive(runtime) + } + val dsts = Array(outputFormats()(0).getPrimitive(runtime)) + (srcs, dsts) + } + val indexes = Array.fill(srcs.length)(0) + + val primitive = MklDnn.PrimitiveCreate2(primDesc, srcs, indexes, srcs.length, dsts, dsts.length) + + updateOutputMemoryPrimitives = srcs ++ dsts + updateOutputPrimitives = Array(primitive) + output = initTensor(outputFormats()(0)) + + if (phase == TrainingPhase) { + this.runningMean.zero() + this.runningVariance.zero() + } + + if (updateOutputTensors != null) { + updateOutputTensors = Array.empty + } + + (inputFormats(), outputFormats()) + } + + override def updateOutput(input: Activity): Activity = { + if (updateOutputTensors == null) { + if (this.isTraining()) { + val buffer = new ArrayBuffer[Tensor[Float]]() + buffer.append(input.asInstanceOf[Tensor[Float]]) + buffer.append(weightAndBias) + buffer.append(output.asInstanceOf[Tensor[Float]]) + buffer.append(mean) + buffer.append(variance) + updateOutputTensors = buffer.toArray + } else { + val buffer = new ArrayBuffer[Tensor[Float]]() + buffer.append(input.asInstanceOf[Tensor[Float]]) + buffer.append(runningMean) + buffer.append(runningVariance) + buffer.append(weightAndBias) + buffer.append(output.asInstanceOf[Tensor[Float]]) + updateOutputTensors = buffer.toArray + } + } + + updateWithNewTensor(updateOutputTensors, 0, input) + + MklDnnOps.streamSubmit(runtime.stream, 1, updateOutputPrimitives, updateOutputPrimitives.length, + updateOutputMemoryPrimitives, updateOutputTensors) + + if (this.isTraining()) { + // update running(Mean, Var) and scaleFactor + scaleFactor = scaleFactor * momentum.toFloat + 1 + + mean.axpby(1, momentum.toFloat, runningMean) + variance.axpby(biasFactor, momentum.toFloat, runningVariance) + } + + output + } + + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + _gradOutputFormats = Array(NativeData(outputFormats()(0).shape, outputFormats()(0).layout)) + + // [PERF] the format of gradInput should be the same as input + val backwardDesc = phase match { + case TrainingPhase => + MklDnn.BatchNormBackwardDescInit(PropKind.Backward, + inputFormats()(0).getMemoryDescription(), + inputFormats()(0).getMemoryDescription(), eps.toFloat, + MklDnn.BatchNormFlag.mkldnn_use_scaleshift) + case _ => throw new UnsupportedOperationException + } + + val gradWeightAndBias: NativeData = NativeData(Array(nOutput * 
2), Memory.Format.x) + val gradWeightPrimitive = gradWeightAndBias.getPrimitive(runtime) + + val primDesc = MklDnn.PrimitiveDescCreate(backwardDesc, runtime.engine, 0) + + _gradInputFormats = Array(MemoryData.operationWant(primDesc, Query.DiffSrcPd)) + + // maybe will throw null exception + val srcs = Array(updateOutputMemoryPrimitives(Index.input), + updateOutputMemoryPrimitives(Index.mean), + updateOutputMemoryPrimitives(Index.variance), + grad(0).getPrimitive(runtime), + updateOutputMemoryPrimitives(Index.weight)) + val indexes = Array.fill(srcs.length)(0) + val dsts = Array(gradInputFormats()(0), gradWeightAndBias).map(_.getPrimitive(runtime)) + + val primitive = MklDnn.PrimitiveCreate2(primDesc, srcs, indexes, srcs.length, + dsts, dsts.length) + + updateGradInputMemoryPrimitives = srcs ++ dsts + updateGradInputPrimitives = Array(primitive) + gradInput = initTensor(gradInputFormats()(0)) + + (_gradOutputFormats, gradInputFormats()) + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + if (updateGradInputTensors == null) { + val buffer = new ArrayBuffer[Tensor[Float]]() + buffer.append(input.asInstanceOf[Tensor[Float]]) + buffer.append(mean) + buffer.append(variance) + buffer.append(gradOutput.asInstanceOf[Tensor[Float]]) + buffer.append(weightAndBias) + buffer.append(gradInput.asInstanceOf[Tensor[Float]]) + buffer.append(gradWeightAndBias.asInstanceOf[Tensor[Float]]) + updateGradInputTensors = buffer.toArray + } + + updateWithNewTensor(updateGradInputTensors, 0, input) + updateWithNewTensor(updateGradInputTensors, 3, gradOutput) + + MklDnnOps.streamSubmit(runtime.stream, 1, updateGradInputPrimitives, + updateGradInputPrimitives.length, updateGradInputMemoryPrimitives, updateGradInputTensors) + + gradInput + } + + override def accGradParameters(input: Activity, gradOutput: Activity): Unit = { + // do nothing + } + + override def zeroGradParameters(): Unit = { + if (affine) { gradWeightAndBias.zero() } + if (gradInput != null) { gradInput.asInstanceOf[DnnTensor[Float]].zero() } + } + + override def parameters(): (Array[Tensor[Float]], Array[Tensor[Float]]) = { + (Array(weightAndBias), Array(gradWeightAndBias)) + } + + override def parametersWithShape(): (Array[MemoryData], Array[MemoryData]) = { + (Array(NativeData(weightAndBias.size(), Memory.Format.x)), + Array(NativeData(gradWeightAndBias.size(), Memory.Format.x))) + } + + override def toString(): String = { + s"nn.mkl.SpatialBatchNormalization($nOutput, $eps, $momentum, $affine)" + } +} + +object SpatialBatchNormalization { + def apply( + nOutput: Int, + eps: Double = 1e-5, + momentum: Double = 0.1, + affine: Boolean = true, + initWeight: Tensor[Float] = null, + initBias: Tensor[Float] = null, + initGradWeight: Tensor[Float] = null, + initGradBias: Tensor[Float] = null): SpatialBatchNormalization = { + new SpatialBatchNormalization(nOutput, eps, momentum, affine, + initWeight, initBias, initGradWeight, initGradBias) + } +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SpatialConvolution.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SpatialConvolution.scala new file mode 100644 index 00000000000..45e14433e4d --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nn/mkldnn/SpatialConvolution.scala @@ -0,0 +1,402 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.Module +import com.intel.analytics.bigdl.mkl._ +import com.intel.analytics.bigdl.nn._ +import com.intel.analytics.bigdl.nn.abstractnn._ +import com.intel.analytics.bigdl.tensor.{DnnTensor, Tensor} + +import scala.collection.mutable.ArrayBuffer + +class SpatialConvolution( + val nInputPlane: Int, + val nOutputPlane: Int, + val kernelW: Int, + val kernelH: Int, + val strideW: Int = 1, + val strideH: Int = 1, + val padW: Int = 0, + val padH: Int = 0, + val nGroup: Int = 1, + val propagateBack: Boolean = true, + val initWeight: Tensor[Float] = null, + val initBias: Tensor[Float] = null, + val initGradWeight: Tensor[Float] = null, + val initGradBias: Tensor[Float] = null, + val withBias: Boolean = true, + val format: DataFormat = DataFormat.NCHW +) extends MklDnnLayer with Initializable { + private val weightShape = if (nGroup == 1) { + Array(nOutputPlane, nInputPlane, kernelH, kernelW) + } else { + Array (nGroup, nOutputPlane / nGroup, nInputPlane / nGroup, kernelH, kernelW) + } + + // !!!important!!! this is for weight conversion. The weights in forward and backward is + // different. + val reorderManager = new ReorderManager + + val weight: DnnTensor[Float] = DnnTensor[Float](weightShape) + var weightForBackward: DnnTensor[Float] = _ + val bias: DnnTensor[Float] = DnnTensor[Float](Array(nOutputPlane)) + val gradWeight: DnnTensor[Float] = DnnTensor[Float](weightShape) + val gradBias: DnnTensor[Float] = DnnTensor[Float](Array(nOutputPlane)) + + var forwardPrimDesc: Long = 0L + + var updateOutputMemoryPrimitives: Array[Long] = _ + var updateOutputTensors: Array[Tensor[Float]] = _ + var updateGradInputMemoryPrimitives: Array[Long] = _ + var updateGradInputTensors: Array[Tensor[Float]] = _ + var updateGradWMemoryPrimitives: Array[Long] = _ + var updateGradWTensors: Array[Tensor[Float]] = _ + + var _relu = false + var _sum = false + + def relu: Boolean = _relu + def setReLU(value: Boolean = true): this.type = { + _relu = value + this + } + + def sum: Boolean = _sum + def setSum(value: Boolean = true): this.type = { + _sum = value + this + } + + var sumOp: MklDnnLayer = null + def setSumOp(conv: Module[Float]): this.type = { + sumOp = conv.asInstanceOf[MklDnnLayer] + this + } + + object ParamsShape { + var weight: MemoryData = _ + var weightForBackward: MemoryData = _ + var bias: MemoryData = _ + var gradWeight: MemoryData = _ + var gradBias: MemoryData = _ + } + + private def getOutputShape(oh: Int, ow: Int, batchSize: Int = -1): Array[Int] = { + format match { + case DataFormat.NCHW => + if (batchSize == -1) { + Array(nOutputPlane, oh, ow) + } else { + Array(batchSize, nOutputPlane, oh, ow) + } + case DataFormat.NHWC => + if (batchSize == -1) { + Array(oh, ow, nOutputPlane) + } else { + Array(batchSize, oh, ow, nOutputPlane) + } + + } + } + + { + val stdv = 1.0 / math.sqrt(kernelW * kernelH * nInputPlane) + val wInit: InitializationMethod = RandomUniform(-stdv, stdv) + val bInit: InitializationMethod = if (withBias) RandomUniform(-stdv, stdv) + else null + setInitMethod(wInit, bInit) + } + + 
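A quick sketch of how the grouped weight shape and the default uniform-init bound defined above work out in practice; the concrete sizes (a 64-to-128 plane, 3x3 convolution with two groups) are assumed purely for illustration and are not part of this patch:

  // Illustrative values only: 64 input planes, 128 output planes, 3x3 kernel, 2 groups.
  val nInputPlane = 64; val nOutputPlane = 128
  val kernelH = 3; val kernelW = 3; val nGroup = 2

  // With nGroup > 1 the weights use the 5-D grouped layout (groups, out/g, in/g, kH, kW).
  val weightShape =
    if (nGroup == 1) Array(nOutputPlane, nInputPlane, kernelH, kernelW)
    else Array(nGroup, nOutputPlane / nGroup, nInputPlane / nGroup, kernelH, kernelW)
  // weightShape == Array(2, 64, 32, 3, 3)

  // Bound used by the default RandomUniform(-stdv, stdv) weight initialization above.
  val stdv = 1.0 / math.sqrt(kernelW * kernelH * nInputPlane)
  // stdv == 1.0 / math.sqrt(576) ≈ 0.0417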
override def reset(): Unit = { + if (initWeight == null) { // TODO only support oihw format weights + val t = Tensor[Float](weightShape) + weightInitMethod.init(t, VariableFormat.OUT_IN) + weight.copy(t) + } else { + weight.copy(initWeight) + } + + if (initBias == null) { + val t = Tensor[Float](Array(nOutputPlane)) + biasInitMethod.init(t, VariableFormat.ONE_D) + bias.copy(t) + } else { + bias.copy(initBias) + } + + zeroGradParameters() + } + + override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = { + reorderManager.setRuntime(runtime) + + val inputHeight = inputs(0).shape(2) // TODO only supports 4-D and nchw + val inputWidth = inputs(0).shape(3) + + val sizes = if (padW == -1 && padH == -1) { + Utils.getSAMEOutSizeAndPadding(inputHeight, inputWidth, strideH, strideW, kernelH, kernelW) + } else { + Utils.getOutSizeAndPadding(inputHeight, inputWidth, strideH, strideW, kernelH, kernelW, + padH, padW, ceilMode = false) + } + + val outputHeight = sizes(4) + val outputWidth = sizes(5) + + val inputShape = inputs(0).shape + val outputShape = Array(inputs(0).shape(0), nOutputPlane, outputHeight, outputWidth) + + val src = NativeData(inputShape, Memory.Format.any) + val wei = NativeData(weightShape, Memory.Format.any) + val bis = NativeData(bias.size(), Memory.Format.x) + val dst = NativeData(outputShape, Memory.Format.any) + + val desc = MklDnn.ConvForwardDescInit( + PropKind.ForwardTraining, AlgKind.ConvolutionDirect, + src.getMemoryDescription(), + wei.getMemoryDescription(), + bis.getMemoryDescription(), + dst.getMemoryDescription(), + Array(strideW, strideH), Array(padH, padW), Array(padH, padW), // TODO check the meaning + MklDnn.PaddingKind.mkldnnPaddingZero) + + forwardPrimDesc = if (relu || sum) { + val postOps = MklDnn.CreatePostOps() + if (sum) { + MklDnn.PostOpsAppendSum(postOps, 1.0f) + } + if (relu) { + MklDnn.PostOpsAppendEltwise(postOps, 1.0f, AlgKind.EltwiseRelu, 0.0f, 0.0f) + } + val attr = MklDnn.CreateAttr() + MklDnn.AttrSetPostOps(attr, postOps) + + MklDnn.PrimitiveDescCreateV2(desc, attr, runtime.engine, 0) + // TODO we should destroy these ops + } else { + MklDnn.PrimitiveDescCreate(desc, runtime.engine, 0) + } + + val List(realSrc, realWei, realDst) = List(Query.SrcPd, Query.WeightsPd, Query.DstPd).map {x => + MemoryData.operationWant(forwardPrimDesc, x) + } + + ParamsShape.weight = realWei + ParamsShape.bias = bis + + val srcs = Array(realSrc.getPrimitive(runtime), realWei.getPrimitive(runtime), + bis.getPrimitive(runtime)) + val indexes = Array.fill(srcs.length)(0) + val dsts = Array(realDst.getPrimitive(runtime)) + + val primitive = MklDnn.PrimitiveCreate2(forwardPrimDesc, srcs, indexes, srcs.length, + dsts, dsts.length) + + updateOutputMemoryPrimitives = srcs ++ dsts + updateOutputPrimitives = Array(primitive) + output = initTensor(dst) + + _inputFormats = Array(realSrc) + _outputFormats = Array(realDst) + (_inputFormats, _outputFormats) + } + + override def updateOutput(input: Activity): Activity = { + if (updateOutputTensors == null) { + val buffer = new ArrayBuffer[Tensor[Float]]() + buffer.append(input.asInstanceOf[Tensor[Float]]) + buffer.append(weight) + buffer.append(bias) + if (sum) { + output = sumOp.output + } + buffer.append(output.asInstanceOf[Tensor[Float]]) + updateOutputTensors = buffer.toArray + } + + updateWithNewTensor(updateOutputTensors, 0, input) + + MklDnnOps.streamSubmit(runtime.stream, 1, updateOutputPrimitives, updateOutputPrimitives.length, + updateOutputMemoryPrimitives, updateOutputTensors) + + output + } 
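As a usage sketch only (it mirrors the Sequential/ReorderMemory/compile pattern of the pooling spec added later in this patch and is not code from the diff), the layer built by initFwdPrimitives can be exercised like this; the 3-to-8 plane, 28x28 shapes and the stride/padding values are assumed for illustration:

  // Hypothetical shapes: batch of 2, 3 input planes, 28x28 images, 3x3 kernel, stride 1, pad 1.
  val batchSize = 2
  val conv = SpatialConvolution(3, 8, 3, 3, 1, 1, 1, 1)

  val seq = Sequential()
  seq.add(ReorderMemory(HeapData(Array(batchSize, 3, 28, 28), Memory.Format.nchw),
    HeapData(Array(batchSize, 3, 28, 28), Memory.Format.nchw)))
  seq.add(conv)
  seq.add(ReorderMemory(HeapData(Array(batchSize, 8, 28, 28), Memory.Format.nchw),
    HeapData(Array(batchSize, 8, 28, 28), Memory.Format.nchw)))
  seq.compile(Phase.TrainingPhase,
    Array(HeapData(Array(batchSize, 3, 28, 28), Memory.Format.nchw)))

  val input = Tensor[Float](batchSize, 3, 28, 28).apply1(_ => scala.util.Random.nextFloat())
  // Runs the mkl-dnn forward primitive created in initFwdPrimitives; with pad 1 and stride 1
  // the output keeps the 28x28 spatial size, so its shape is (2, 8, 28, 28).
  val output = seq.forward(input)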
+ + override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = { + val inputShape = inputFormats()(0).shape.length match { + case 1 => inputFormats()(0).shape ++ Array(1) // TODO Test + case _ => inputFormats()(0).shape + } + + val outputShape = outputFormats()(0).shape + + val src = NativeData(inputShape, Memory.Format.any) + val wei = NativeData(weightShape, Memory.Format.any) + val bis = NativeData(bias.size(), Memory.Format.x) + val dst = NativeData(outputShape, Memory.Format.any) + + val desc = MklDnn.ConvBackwardDataDescInit( + AlgKind.ConvolutionDirect, + src.getMemoryDescription(), + wei.getMemoryDescription(), // TODO check correctness of strides and padding + dst.getMemoryDescription(), Array(strideW, strideH), Array(padH, padW), Array(padH, padW), + MklDnn.PaddingKind.mkldnnPaddingZero) + val backwardPrimDesc = MklDnn.PrimitiveDescCreate(desc, runtime.engine, forwardPrimDesc) + + val List(realDiffSrc, realWei, realDiffDst) = + List(Query.DiffSrcPd, Query.WeightsPd, Query.DiffDstPd).map {x => + MemoryData.operationWant(backwardPrimDesc, x) + } + + ParamsShape.weightForBackward = realWei + + reorderManager.register(ParamsShape.weight, realWei) + + val srcs = Array(realDiffDst.getPrimitive(runtime), realWei.getPrimitive(runtime), + inputFormats()(0).getPrimitive(runtime)) + val indexes = Array.fill(srcs.length)(0) + val dsts = Array(realDiffSrc.getPrimitive(runtime)) + + val primitive = MklDnn.PrimitiveCreate2(backwardPrimDesc, srcs, indexes, srcs.length, + dsts, dsts.length) + + updateGradInputMemoryPrimitives = srcs ++ dsts + updateGradInputPrimitives = Array(primitive) + gradInput = initTensor(realDiffSrc) + + _gradInputFormats = Array(realDiffSrc) + _gradOutputFormats = Array(realDiffDst) + (_gradOutputFormats, _gradInputFormats) + } + + override def updateGradInput(input: Activity, gradOutput: Activity): Activity = { + weightForBackward = reorderManager.infer(Array(ParamsShape.weight), + Array(ParamsShape.weightForBackward), weight).asInstanceOf[DnnTensor[Float]] + + if (updateGradInputTensors == null) { + val buffer = new ArrayBuffer[Tensor[Float]]() + buffer.append(gradOutput.asInstanceOf[Tensor[Float]]) + buffer.append(weightForBackward) + buffer.append(input.asInstanceOf[Tensor[Float]]) + buffer.append(gradInput.asInstanceOf[Tensor[Float]]) + updateGradInputTensors = buffer.toArray + } + + updateWithNewTensor(updateGradInputTensors, 2, input) + updateWithNewTensor(updateGradInputTensors, 0, gradOutput) + + MklDnnOps.streamSubmit(runtime.stream, 1, updateGradInputPrimitives, + updateGradInputPrimitives.length, updateGradInputMemoryPrimitives, updateGradInputTensors) + + gradInput + } + override private[mkldnn] def initGradWPrimitives(grad: Array[MemoryData], + phase: Phase): Array[MemoryData] = { + val inputShape = inputFormats()(0).shape + val outputShape = inputFormats()(0).shape + + val src = NativeData(inputShape, Memory.Format.any) + val wei = NativeData(weightShape, Memory.Format.any) + val bis = NativeData(bias.size(), Memory.Format.x) + + val desc = MklDnn.ConvBackwardWeightsDescInit( + AlgKind.ConvolutionDirect, + src.getMemoryDescription(), + wei.getMemoryDescription(), + bis.getMemoryDescription(), + grad(0).getMemoryDescription(), Array(strideW, strideH), Array(padH, padW), Array(padH, padW), + MklDnn.PaddingKind.mkldnnPaddingZero) + val gradWeightPrimDesc = MklDnn.PrimitiveDescCreate(desc, runtime.engine, forwardPrimDesc) + + // TODO here seems some errors ?????? check the realSrc format. 
+ val List(realSrc, realWei, realDiffDst) = + List(Query.SrcPd, Query.DiffWeightsPd, Query.DiffDstPd).map { x => + MemoryData.operationWant(gradWeightPrimDesc, x) + } + + ParamsShape.gradWeight = realWei + ParamsShape.gradBias = bis + + val srcs = Array(realSrc.getPrimitive(runtime), realDiffDst.getPrimitive(runtime)) + val indexes = Array.fill(srcs.length)(0) + val dsts = Array(realWei.getPrimitive(runtime), bis.getPrimitive(runtime)) + + val primitive = MklDnn.PrimitiveCreate2(gradWeightPrimDesc, srcs, indexes, srcs.length, + dsts, dsts.length) + + updateGradWMemoryPrimitives = srcs ++ dsts + accGradientPrimitives = Array(primitive) + + _gradOutputFormatsForWeight = Array(realDiffDst) + (_gradOutputFormatsForWeight) + } + + override def accGradParameters(input: Activity, gradOutput: Activity): Unit = { + if (updateGradWTensors == null) { + val buffer = new ArrayBuffer[Tensor[Float]]() + buffer.append(input.asInstanceOf[Tensor[Float]]) + buffer.append(gradOutput.asInstanceOf[Tensor[Float]]) + buffer.append(gradWeight) + buffer.append(gradBias) + updateGradWTensors = buffer.toArray + } + + updateWithNewTensor(updateGradWTensors, 0, input) + updateWithNewTensor(updateGradWTensors, 1, gradOutput) + + MklDnnOps.streamSubmit(runtime.stream, 1, accGradientPrimitives, + accGradientPrimitives.length, updateGradWMemoryPrimitives, updateGradWTensors) + } + + override def parameters(): (Array[Tensor[Float]], Array[Tensor[Float]]) = { + (Array(weight, bias), Array(gradWeight, gradBias)) + } + + override def zeroGradParameters(): Unit = { + gradWeight.zero() + gradBias.zero() + } + + override def parametersWithShape(): (Array[MemoryData], Array[MemoryData]) = { + (Array(ParamsShape.weight, ParamsShape.bias), Array(ParamsShape.gradWeight, ParamsShape.bias)) + } +} + +object SpatialConvolution { + def apply( + nInputPlane: Int, + nOutputPlane: Int, + kW: Int, + kH: Int, + dW: Int = 1, + dH: Int = 1, + padW: Int = 0, + padH: Int = 0, + nGroup: Int = 1, + propagateBack: Boolean = true, + initWeight: Tensor[Float] = null, + initBias: Tensor[Float] = null, + initGradWeight: Tensor[Float] = null, + initGradBias: Tensor[Float] = null, + withBias: Boolean = true, + format: DataFormat = DataFormat.NCHW): SpatialConvolution = { + new SpatialConvolution(nInputPlane, nOutputPlane, kW, kH, dW, + dH, padW, padH, nGroup, propagateBack, + initWeight, initBias, initGradWeight, initGradBias, withBias, format) + } +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/ArrayStorage.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/ArrayStorage.scala index ab318d87a68..1cce1a8e4a5 100644 --- a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/ArrayStorage.scala +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/ArrayStorage.scala @@ -18,7 +18,9 @@ package com.intel.analytics.bigdl.tensor import java.util -import scala.reflect.ClassTag +import com.intel.analytics.bigdl.mkl.Memory + +import scala.reflect._ private[tensor] class ArrayStorage[@specialized(Double, Float) T: ClassTag]( private[tensor] var values: Array[T]) extends Storage[T] { @@ -38,12 +40,12 @@ private[tensor] class ArrayStorage[@specialized(Double, Float) T: ClassTag]( source match { case s: ArrayStorage[T] => System.arraycopy(s.values, sourceOffset, this.values, offset, length) - case s: Storage[T] => - var i = 0 - while (i < length) { - this.values(i + offset) = s(i + sourceOffset) - i += 1 - } + case s: DnnStorage[T] => + require(classTag[T] == ClassTag.Float, "Only 
support copy float dnn storage") + require(sourceOffset == 0, "dnn storage offset should be 0") + Memory.CopyPtr2Array(s.ptr.address, 0, values.asInstanceOf[Array[Float]], offset, length, + DnnStorage.FLOAT_BYTES) + case _ => throw new UnsupportedOperationException("Only support dnn or array storage") } this } diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DenseTensor.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DenseTensor.scala index a5692b024f5..6604e148d4e 100644 --- a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DenseTensor.scala +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DenseTensor.scala @@ -28,7 +28,7 @@ import scala.reflect.ClassTag @SerialVersionUID(5876322619614900645L) private[tensor] class DenseTensor[@specialized T: ClassTag]( - private[tensor] var _storage: Storage[T], + private[tensor] var _storage: ArrayStorage[T], private[tensor] var _storageOffset: Int, private[tensor] var _size: Array[Int], private[tensor] var _stride: Array[Int], @@ -64,7 +64,7 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( override def squeeze(dim: Int): Tensor[T] = DenseTensor.squeeze(this, dim - 1) override def squeezeNewTensor(): Tensor[T] = { - val result = new DenseTensor(this.storage(), this.storageOffset(), this._size, this._stride) + val result = new DenseTensor(this._storage, this.storageOffset(), this._size, this._stride) result.squeeze() } @@ -171,7 +171,7 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( require(this.isContiguous(), "current tensor is not contiguous") require(sizes.product == this.nElement(), "invalid size eElement") - new DenseTensor(this.storage(), this.storageOffset(), sizes.clone()) + new DenseTensor(this._storage, this.storageOffset(), sizes.clone()) } override def unfold(dim: Int, size: Int, step: Int): Tensor[T] = { @@ -228,7 +228,7 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( this(new ArrayStorage[T](new Array[T](dims.product)), 0, dims.toArray, DenseTensor.size2Stride(dims.toArray), dims.length) - private[tensor] def this(storage: Storage[T])(implicit ev: TensorNumeric[T]) = { + private[tensor] def this(storage: ArrayStorage[T])(implicit ev: TensorNumeric[T]) = { this(null, 0, null, null, 0) val _storageOffset = 0 val _size = Array(storage.length) @@ -236,7 +236,7 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( DenseTensor.newWithStorage(this, storage, _storageOffset, _size, _stride, ev) } - private[tensor] def this(storage: Storage[T], storageOffset: Int, size: Array[Int] = null, + private[tensor] def this(storage: ArrayStorage[T], storageOffset: Int, size: Array[Int] = null, stride: Array[Int] = null)(implicit ev: TensorNumeric[T]) = { this(null, 0, null, null, 0) if (storage != null) { @@ -249,7 +249,8 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( private[tensor] def this(other: Tensor[T])(implicit ev: TensorNumeric[T]) = { this(null, 0, null, null, 0) - val _storage = other.storage() + require(other.isInstanceOf[DenseTensor[_]], "Only support dense tensor in this operation") + val _storage = other.storage().asInstanceOf[ArrayStorage[T]] val _storageOffset = other.storageOffset() - 1 val _size = other.size() val _stride = other.stride() @@ -363,8 +364,9 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( } override def set(other: Tensor[T]): Tensor[T] = { - DenseTensor.rawSet(this, other.storage(), other.storageOffset() - 1, other.nDimension(), - 
other.size(), other.stride()) + require(other.isInstanceOf[DenseTensor[_]], "Only support dense tensor in this operation") + DenseTensor.rawSet(this, other.storage().asInstanceOf[ArrayStorage[T]], + other.storageOffset() - 1, other.nDimension(), other.size(), other.stride()) } override def set(storage: Storage[T], storageOffset: Int = 1, sizes: Array[Int] = null, @@ -373,7 +375,8 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( require(sizes.length == strides.length) } - DenseTensor.rawSet(this, storage, storageOffset - 1, + require(storage.isInstanceOf[ArrayStorage[_]], "Only support array storage in this operation") + DenseTensor.rawSet(this, storage.asInstanceOf[ArrayStorage[T]], storageOffset - 1, if (sizes == null) 0 else sizes.length, sizes, strides) } @@ -421,7 +424,15 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( } override def copy(other: Tensor[T]): Tensor[T] = { - DenseTensor.copy(this, other) + other match { + case t: DnnTensor[_] => + require(this.nElement() == other.nElement(), "tensor size must match") + this.storage().copy(other.storage(), this.storageOffset() - 1, 0, other.nElement()) + case t: DenseTensor[_] => + DenseTensor.copy(this, other) + case _ => throw new UnsupportedOperationException( + "only support copy from dense tensor or dnn tensor") + } this } @@ -934,6 +945,7 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( override def add(value: T, y: Tensor[T]): Tensor[T] = DenseTensorMath.cadd(this, this, value, y) override def add(x: Tensor[T]): Tensor[T] = { + require(x.isInstanceOf[DenseTensor[_]], "Only support dense tensor in this operation") if (this.nElement() == x.nElement()) { if (MKL.isMKLLoaded && this.isContiguous() && x.isContiguous()) { ev.vAdd(this.nElement(), this.storage().array(), this.storageOffset() - 1, @@ -955,12 +967,12 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( i += 1 } } else { - this.add(expandTensor(x)) + this.add(expandTensor(x.asInstanceOf[DenseTensor[T]])) } this } - private[tensor] def expandTensor(x: Tensor[T]): Tensor[T] = { + private[tensor] def expandTensor(x: DenseTensor[T]): Tensor[T] = { val targetSize = DenseTensor.expandSize(this, x) val expandStrides = new Array[Int](targetSize.length) @@ -972,7 +984,7 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( i -= 1 } val expandX = new DenseTensor[T]( - x.storage(), + x.storage().asInstanceOf[ArrayStorage[T]], x.storageOffset(), targetSize, expandStridesX @@ -985,7 +997,7 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( i -= 1 } val tensor1 = new DenseTensor[T]( - this.storage(), + this._storage, this.storageOffset(), targetSize, expandStrides @@ -1031,6 +1043,7 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( DenseTensorMath.csub(this, this, ev.negative(value), y) override def sub(x: Tensor[T]): Tensor[T] = { + require(x.isInstanceOf[DenseTensor[T]], "Only dense tensor is supported in this operation") if (this.nElement() == x.nElement()) { if (MKL.isMKLLoaded && this.isContiguous() && x.isContiguous() && (x.getType() == DoubleType || x.getType() == FloatType)) { @@ -1055,7 +1068,7 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( i += 1 } } else { - this.sub(expandTensor(x)) + this.sub(expandTensor(x.asInstanceOf[DenseTensor[T]])) } this @@ -1162,9 +1175,16 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( this } - override def cmul(y: Tensor[T]): Tensor[T] = DenseTensorMath.cmul(this, this, y) + override def cmul(y: Tensor[T]): Tensor[T] = { + 
require(y.isInstanceOf[DenseTensor[_]], "Only support dense tensor in this operation") + DenseTensorMath.cmul(this, this, y.asInstanceOf[DenseTensor[T]]) + } - override def cmul(x: Tensor[T], y: Tensor[T]): Tensor[T] = DenseTensorMath.cmul(this, x, y) + override def cmul(x: Tensor[T], y: Tensor[T]): Tensor[T] = { + require(x.isInstanceOf[DenseTensor[_]], "Only support dense tensor in this operation") + require(y.isInstanceOf[DenseTensor[_]], "Only support dense tensor in this operation") + DenseTensorMath.cmul(this, x.asInstanceOf[DenseTensor[T]], y.asInstanceOf[DenseTensor[T]]) + } override def cdiv(y: Tensor[T]): Tensor[T] = DenseTensorMath.cdiv(this, this, y) @@ -1199,6 +1219,7 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( override def div(value: T): Tensor[T] = DenseTensorMath.mul(this, null, ev.inv(value)) override def div(x: Tensor[T]): Tensor[T] = { + require(x.isInstanceOf[DenseTensor[_]], "Only dense tensor is supported in this operation") if (this.nElement() == x.nElement()) { if (MKL.isMKLLoaded && this.isContiguous() && x.isContiguous()) { ev.vDiv(this.nElement(), this.storage().array(), this.storageOffset() - 1, @@ -1222,7 +1243,7 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( i += 1 } } else { - this.div(expandTensor(x)) + this.div(expandTensor(x.asInstanceOf[DenseTensor[T]])) } this @@ -1316,8 +1337,8 @@ private[tensor] class DenseTensor[@specialized T: ClassTag]( xSize = Array(1) ++ xSize i += 1 } - val size = new DenseTensor(Storage[T](xSize.map(x => ev.fromType[Int](x)))). - cmul(new DenseTensor(Storage[T](sizes.map(x => ev.fromType[Int](x))))). + val size = new DenseTensor(new ArrayStorage[T](xSize.map(x => ev.fromType[Int](x)))). + cmul(new DenseTensor(new ArrayStorage[T](sizes.map(x => ev.fromType[Int](x))))). 
storage().array().map(x => ev.toType[Int](x)) xTensor.resize(xSize) result.resize(size) @@ -2257,7 +2278,7 @@ object DenseTensor { } private[tensor] def newWithStorage[@specialized(Float, Double) T: ClassTag]( - tensor: DenseTensor[T], storage: Storage[T], storageOffset: Int, size: Array[Int], + tensor: DenseTensor[T], storage: ArrayStorage[T], storageOffset: Int, size: Array[Int], stride: Array[Int], ev: TensorNumeric[T]): DenseTensor[T] = { if (size != null && stride != null) { require(size.length == stride.length, "inconsistent size") @@ -2278,7 +2299,7 @@ object DenseTensor { } private[tensor] def rawSet[@specialized(Float, Double) T: ClassTag]( - self: DenseTensor[T], storage: Storage[T], storageOffset: Int, + self: DenseTensor[T], storage: ArrayStorage[T], storageOffset: Int, nDimension: Int, _size: Array[Int], _stride: Array[Int]): DenseTensor[T] = { self._storage = storage require(storageOffset >= 0, "Tensor: invalid storage offset") @@ -2467,9 +2488,9 @@ object DenseTensor { } private[tensor] def set[@specialized(Float, Double) T: ClassTag]( - self: DenseTensor[T], other: Tensor[T]): Tensor[T] = { + self: DenseTensor[T], other: DenseTensor[T]): Tensor[T] = { if (self != other) { - DenseTensor.rawSet(self, other.storage, other.storageOffset, + DenseTensor.rawSet(self, other.storage.asInstanceOf[ArrayStorage[T]], other.storageOffset, other.nDimension, other.size, other.stride) } else { self @@ -2488,11 +2509,11 @@ object DenseTensor { } private[tensor] def select[@specialized(Float, Double) T: ClassTag]( - self: DenseTensor[T], source: Tensor[T], _dimension: Int, _sliceIndex: Int): Unit = { + self: DenseTensor[T], source: DenseTensor[T], _dimension: Int, _sliceIndex: Int): Unit = { var src = source if (src == null) src = self - require(src.nDimension() > 0, "cannot select on a scalar") - require(_dimension >= 0 && _dimension < src.nDimension(), "out of range") + require(src.nDimension > 0, "cannot select on a scalar") + require(_dimension >= 0 && _dimension < src.nDimension, "out of range") require(_sliceIndex >= 0 && _sliceIndex < src.size(_dimension + 1), s"${_sliceIndex} out of range 0 to ${src.size(_dimension + 1) - 1}") @@ -2510,14 +2531,14 @@ object DenseTensor { } private[tensor] def narrow[@specialized(Float, Double) T: ClassTag]( - self: DenseTensor[T], source: Tensor[T], _dimension: Int, _firstIndex: Int, size: Int) + self: DenseTensor[T], source: DenseTensor[T], _dimension: Int, _firstIndex: Int, size: Int) : Unit = { var src = source if (src == null) { src = self } - require(_dimension >= 0 && _dimension < src.nDimension(), "dimension out of range") + require(_dimension >= 0 && _dimension < src.nDimension, "dimension out of range") require(_firstIndex >= 0 && _firstIndex < src.size(_dimension + 1), s"firstIndex(${_firstIndex}) out of range [0, ${src.size(_dimension + 1)})") require(size > 0 && _firstIndex + size <= src.size(_dimension + 1), @@ -2532,7 +2553,7 @@ object DenseTensor { } private[tensor] def transpose[@specialized(Float, Double) T: ClassTag]( - self: DenseTensor[T], source: Tensor[T], _dimension1: Int, _dimension2: Int): Unit = { + self: DenseTensor[T], source: DenseTensor[T], _dimension1: Int, _dimension2: Int): Unit = { var src = source if (src == null) src = self require(_dimension1 >= 0 && _dimension1 < src.nDimension, "out of range") diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DenseTensorMath.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DenseTensorMath.scala index 03c0355fa58..1961fa7d53c 
100644 --- a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DenseTensorMath.scala +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DenseTensorMath.scala @@ -45,7 +45,7 @@ object DenseTensorMath { self } - def cmul[@specialized T](self: DenseTensor[T], x: Tensor[T], y: Tensor[T]) + def cmul[@specialized T](self: DenseTensor[T], x: DenseTensor[T], y: DenseTensor[T]) (implicit ev: TensorNumeric[T]): Tensor[T] = { if (x.nElement() != y.nElement() && DenseTensor.canFastBroadcast(x, y)) { require(self.nElement() == x.nElement(), "the self tensor nElement is not same as x" + @@ -53,7 +53,8 @@ object DenseTensorMath { // recursive cmul var i = 0 while(i < x.size(1)) { - cmul(self.select(1, i + 1).asInstanceOf[DenseTensor[T]], x.select(1, i + 1), y) + cmul(self.select(1, i + 1).asInstanceOf[DenseTensor[T]], + x.select(1, i + 1).asInstanceOf[DenseTensor[T]], y) i += 1 } } else if (x.nElement() != y.nElement() && DenseTensor.canFastBroadcast(y, x)) { @@ -62,7 +63,8 @@ object DenseTensorMath { // recursive cmul var i = 0 while(i < y.size(1)) { - cmul(self.select(1, i + 1).asInstanceOf[DenseTensor[T]], x, y.select(1, i + 1)) + cmul(self.select(1, i + 1).asInstanceOf[DenseTensor[T]], x, + y.select(1, i + 1).asInstanceOf[DenseTensor[T]]) i += 1 } } else if (x.nElement() != y.nElement()) { diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DnnStorage.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DnnStorage.scala new file mode 100644 index 00000000000..9552674f3be --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DnnStorage.scala @@ -0,0 +1,108 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.tensor + +import com.intel.analytics.bigdl.mkl.Memory + +import scala.reflect._ + +/** + * Represent a native array which is needed by mkl-dnn + * @param size Storage size + * @tparam T data type, only support float now + */ +private[tensor] class DnnStorage[T: ClassTag](size: Int) extends Storage[T] { + + require(classTag[T] == ClassTag.Float, "DnnStorage only support float") + + private var _isReleased: Boolean = false + + // Hold the address of the native array + val ptr: Pointer = new Pointer(allocate(size)) + + override def length(): Int = size + + override def apply(index: Int): T = + throw new UnsupportedOperationException("Not support this operation in DnnStorage") + + /** + * Set the element at position index in the storage. 
Valid range of index is 1 to length() + * + * @param index + * @param value + */ + override def update(index: Int, value: T): Unit = + throw new UnsupportedOperationException("Not support this operation in DnnStorage") + + override def copy(source: Storage[T], offset: Int, sourceOffset: Int, length: Int) + : this.type = { + source match { + case s: ArrayStorage[T] => + Memory.CopyArray2Ptr(s.array().asInstanceOf[Array[Float]], sourceOffset, + ptr.address, offset, length, DnnStorage.FLOAT_BYTES) + case s: DnnStorage[T] => + Memory.CopyPtr2Ptr(s.ptr.address, sourceOffset, ptr.address, offset, length, + DnnStorage.FLOAT_BYTES) + case _ => + throw new UnsupportedOperationException("Only support copy from ArrayStorage or DnnStorage") + } + this + } + + override def fill(value: T, offset: Int, length: Int): DnnStorage.this.type = + throw new UnsupportedOperationException("Not support this operation in DnnStorage") + + override def resize(size: Long): DnnStorage.this.type = + throw new UnsupportedOperationException("Not support this operation in DnnStorage") + + override def array(): Array[T] = + throw new UnsupportedOperationException("Not support this operation in DnnStorage") + + override def set(other: Storage[T]): DnnStorage.this.type = + throw new UnsupportedOperationException("Not support this operation in DnnStorage") + + override def iterator: Iterator[T] = + throw new UnsupportedOperationException("Not support this operation in DnnStorage") + + /** + * Release the native array, the storage object is useless + */ + def release(): Unit = { + Memory.AlignedFree(ptr.address) + _isReleased = true + } + + def isReleased(): Boolean = _isReleased + + + private def allocate(capacity: Int): Long = { + require(capacity > 0, s"capacity should not be larger than 0") + val ptr = Memory.AlignedMalloc(capacity * DnnStorage.FLOAT_BYTES, DnnStorage.CACHE_LINE_SIZE) + require(ptr != 0L, s"allocate native aligned memory failed") + ptr + } +} + +/** + * Represent a native point + * @param address + */ +private[bigdl] class Pointer(val address: Long) + +object DnnStorage { + private[tensor] val CACHE_LINE_SIZE = System.getProperty("bigdl.cache.line", "64").toInt + private[tensor] val FLOAT_BYTES: Int = 4 +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DnnTensor.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DnnTensor.scala new file mode 100644 index 00000000000..e0a53e4886c --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/DnnTensor.scala @@ -0,0 +1,370 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.tensor + +import breeze.linalg.{DenseMatrix, DenseVector} +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.tensor.DnnTensor.DnnTensorUnsupportOperations +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric +import com.intel.analytics.bigdl.utils.Table +import org.apache.spark.mllib.linalg +import org.apache.spark.mllib.linalg.Matrix + +import scala.reflect.ClassTag + +class DnnTensor[T: ClassTag]( + private var _storage: DnnStorage[T], + private var sizes: Array[Int] +) (implicit ev: TensorNumeric[T]) + extends DnnTensorUnsupportOperations[T]{ + + override def nElement(): Int = storage.length() + + override def copy(other: Tensor[T]): Tensor[T] = { + other match { + case t: DenseTensor[_] => + require(DnnTensor.noTransposed(t), "dense tensor should not be transposed") + require(this.nElement() == other.nElement(), "tensor elements number must be same") + this._storage.copy(other.storage(), 0, other.storageOffset() - 1, this.nElement()) + case t: DnnTensor[_] => + require(this.nElement() == other.nElement(), "tensor elements number must be same") + this._storage.copy(other.storage(), 0, 0, this.nElement()) + case _ => throw new UnsupportedOperationException( + "Only support copy from dense tensor and dnn tensor") + } + this + } + + def release(): Unit = { + _storage.release() + } + + def storageAddress(): Long = _storage.ptr.address + + def isReleased(): Boolean = _storage.isReleased() + + override def storage(): Storage[T] = _storage + + override def resize(s: Array[Int], stride: Array[Int] = null): this.type = { + require(stride == null, "dnn tensor doesn't have stride") + if (s.product > nElement()) { + _storage.release() + _storage = new DnnStorage[T](s.product) + } + this.sizes = s.clone() + this + } + + override def resize(s: Int): this.type = { + if (s > nElement()) { + _storage.release() + _storage = new DnnStorage[T](s) + } + this.sizes = Array(s) + this + } + + override def add(x: Tensor[T]): Tensor[T] = { + require(x.isInstanceOf[DnnTensor[_]], "Just support two dnn tensor add") + Memory.SAdd(this.nElement(), this._storage.ptr.address, 0, + x.asInstanceOf[DnnTensor[T]]._storage.ptr.address, 0, this._storage.ptr.address, 0) + this + } + + override def zero(): Tensor[T] = { + Memory.Zero(this._storage.ptr.address, this.nElement(), DnnStorage.FLOAT_BYTES) + this + } + + def axpby(a: Float, b: Float, to: DnnTensor[T]): Unit = { + val x = this._storage.ptr.address + val y = to._storage.ptr.address + Memory.Axpby(this.nElement(), a, x, b, y) + } + + override def toTensor[D](implicit ev: TensorNumeric[D]): DnnTensor[D] = { + this.asInstanceOf[DnnTensor[D]] + } + + override def size(): Array[Int] = sizes.clone() + + override def size(d: Int): Int = sizes(d - 1) + + override def dim(): Int = size().length + + override def nDimension(): Int = size().length + + override def getTensorType: TensorType = MklDnnType +} + +object DnnTensor { + // scalastyle:off + private def ???(): Nothing = { + throw new UnsupportedOperationException("DnnTensor doesn't support this operation") + } + // scalastyle:on + + private[tensor] def noTransposed(t: DenseTensor[_]): Boolean = { + var product = 1 + var i = t.dim() + while(i > 0) { + if (product != t.stride(i)) return false + product *= t.size(i) + i -= 1 + } + return true + } + + def apply[T: ClassTag](sizes: Array[Int])(implicit ev: TensorNumeric[T]): DnnTensor[T] = { + val storage = new DnnStorage[T](sizes.product) + new DnnTensor[T](storage, sizes) + } + + def 
apply[T: ClassTag](d1: Int)(implicit ev: TensorNumeric[T]): DnnTensor[T] = { + val storage = new DnnStorage[T](d1) + new DnnTensor[T](storage, Array(d1)) + } + + def apply[T: ClassTag](d1: Int, d2: Int)(implicit ev: TensorNumeric[T]): DnnTensor[T] = { + val storage = new DnnStorage[T](d1 * d2) + new DnnTensor[T](storage, Array(d1, d2)) + } + + def apply[T: ClassTag](d1: Int, d2: Int, d3: Int)(implicit ev: TensorNumeric[T]): DnnTensor[T] = { + val storage = new DnnStorage[T](d1 * d2 * d3) + new DnnTensor[T](storage, Array(d1, d2, d3)) + } + + def apply[T: ClassTag](d1: Int, d2: Int, d3: Int, d4: Int)( + implicit ev: TensorNumeric[T]): DnnTensor[T] = { + val storage = new DnnStorage[T](d1 * d2 * d3 * d4) + new DnnTensor[T](storage, Array(d1, d2, d3, d4)) + } + + def apply[T: ClassTag](d1: Int, d2: Int, d3: Int, d4: Int, d5: Int)( + implicit ev: TensorNumeric[T]): DnnTensor[T] = { + val storage = new DnnStorage[T](d1 * d2 * d3 * d4 * d5) + new DnnTensor[T](storage, Array(d1, d2, d3, d4, d5)) + } + + class DnnTensorUnsupportOperations[T: ClassTag](implicit ev: TensorNumeric[T]) extends Tensor[T] { + // scalastyle:off + override def isEmpty: Boolean = ??? + override def isScalar: Boolean = ??? + override def nDimension(): Int = ??? + override def dim(): Int = ??? + override def size(): Array[Int] = ??? + override def size(dim: Int): Int = ??? + override def stride(): Array[Int] = ??? + override def stride(dim: Int): Int = ??? + override def fill(v: T): Tensor[T] = ??? + override def forceFill(v: Any): Tensor[T] = ??? + override def zero(): Tensor[T] = ??? + override def randn(): Tensor[T] = ??? + override def randn(mean: Double, stdv: Double): Tensor[T] = ??? + override def rand(): Tensor[T] = ??? + override def rand(lowerBound: Double, upperBound: Double): Tensor[T] = ??? + override def bernoulli(p: Double): Tensor[T] = ??? + override def transpose(dim1: Int, dim2: Int): Tensor[T] = ??? + override def t(): Tensor[T] = ??? + override def apply(index: Int): Tensor[T] = ??? + override def apply(indexes: Array[Int]): T = ??? + override def value(): T = ??? + override def valueAt(d1: Int): T = ??? + override def valueAt(d1: Int, d2: Int): T = ??? + override def valueAt(d1: Int, d2: Int, d3: Int): T = ??? + override def valueAt(d1: Int, d2: Int, d3: Int, d4: Int): T = ??? + override def valueAt(d1: Int, d2: Int, d3: Int, d4: Int, d5: Int): T = ??? + override def apply(t: Table): Tensor[T] = ??? + override def update(index: Int, value: T): Unit = ??? + override def update(index: Int, src: Tensor[T]): Unit = ??? + override def update(indexes: Array[Int], value: T): Unit = ??? + override def setValue(value: T): DnnTensorUnsupportOperations.this.type = ??? + override def setValue(d1: Int, value: T): DnnTensorUnsupportOperations.this.type = ??? + override def setValue(d1: Int, d2: Int, value: T): DnnTensorUnsupportOperations.this.type = ??? + override def setValue(d1: Int, d2: Int, d3: Int, value: T): DnnTensorUnsupportOperations.this.type = ??? + override def setValue(d1: Int, d2: Int, d3: Int, d4: Int, value: T): DnnTensorUnsupportOperations.this.type = ??? + override def setValue(d1: Int, d2: Int, d3: Int, d4: Int, d5: Int, value: T): DnnTensorUnsupportOperations.this.type = ??? + override def update(t: Table, value: T): Unit = ??? + override def update(t: Table, src: Tensor[T]): Unit = ??? + override def update(filter: (T) => Boolean, value: T): Unit = ??? + override def isContiguous(): Boolean = ??? + override def contiguous(): Tensor[T] = ??? 
+ override def isSameSizeAs(other: Tensor[_]): Boolean = ??? + override def emptyInstance(): Tensor[T] = ??? + override def resizeAs(src: Tensor[_]): Tensor[T] = ??? + override def cast[D: ClassManifest](castTensor: Tensor[D])(implicit ev: TensorNumeric[D]): Tensor[D] = ??? + override def resize(sizes: Array[Int], strides: Array[Int]): Tensor[T] = ??? + override def resize(size1: Int): Tensor[T] = ??? + override def resize(size1: Int, size2: Int): Tensor[T] = ??? + override def resize(size1: Int, size2: Int, size3: Int): Tensor[T] = ??? + override def resize(size1: Int, size2: Int, size3: Int, size4: Int): Tensor[T] = ??? + override def resize(size1: Int, size2: Int, size3: Int, size4: Int, size5: Int): Tensor[T] = ??? + override def nElement(): Int = ??? + override def select(dim: Int, index: Int): Tensor[T] = ??? + override def storage(): Storage[T] = ??? + override def storageOffset(): Int = ??? + override def set(other: Tensor[T]): Tensor[T] = ??? + override def set(storage: Storage[T], storageOffset: Int, sizes: Array[Int], strides: Array[Int]): Tensor[T] = ??? + override def set(): Tensor[T] = ??? + override def narrow(dim: Int, index: Int, size: Int): Tensor[T] = ??? + override def copy(other: Tensor[T]): Tensor[T] = ??? + override def applyFun[A: ClassManifest](t: Tensor[A], func: (A) => T): Tensor[T] = ??? + override def apply1(func: (T) => T): Tensor[T] = ??? + override def zipWith[A: ClassManifest, B: ClassManifest](t1: Tensor[A], t2: Tensor[B], func: (A, B) => T): Tensor[T] = ??? + override def map(other: Tensor[T], func: (T, T) => T): Tensor[T] = ??? + override def squeeze(): Tensor[T] = ??? + override def squeeze(dim: Int): Tensor[T] = ??? + override def squeezeNewTensor(): Tensor[T] = ??? + override def view(sizes: Array[Int]): Tensor[T] = ??? + override def unfold(dim: Int, size: Int, step: Int): Tensor[T] = ??? + override def repeatTensor(sizes: Array[Int]): Tensor[T] = ??? + override def expandAs(template: Tensor[T]): Tensor[T] = ??? + override def expand(sizes: Array[Int]): Tensor[T] = ??? + override def split(size: Int, dim: Int): Array[Tensor[T]] = ??? + override def split(dim: Int): Array[Tensor[T]] = ??? + override def toBreezeVector(): DenseVector[T] = ??? + override def toMLlibVector(): linalg.Vector = ??? + override def toBreezeMatrix(): DenseMatrix[T] = ??? + override def toMLlibMatrix(): Matrix = ??? + override def getType(): TensorDataType = ??? + override def diff(other: Tensor[T], count: Int, reverse: Boolean): Boolean = ??? + override def addSingletonDimension(t: Tensor[T], dim: Int): Tensor[T] = ??? + override def reshape(sizes: Array[Int]): Tensor[T] = ??? + override def save(path: String, overWrite: Boolean): DnnTensorUnsupportOperations.this.type = ??? + override def getTensorNumeric(): TensorNumeric[T] = ??? + override def getTensorType: TensorType = ??? + override def toArray(): Array[T] = ??? + override def +(s: T): Tensor[T] = ??? + override def +(t: Tensor[T]): Tensor[T] = ??? + override def -(s: T): Tensor[T] = ??? + override def -(t: Tensor[T]): Tensor[T] = ??? + override def unary_-(): Tensor[T] = ??? + override def /(s: T): Tensor[T] = ??? + override def /(t: Tensor[T]): Tensor[T] = ??? + override def *(s: T): Tensor[T] = ??? + override def *(t: Tensor[T]): Tensor[T] = ??? + override def sum(): T = ??? + override def prod(): T = ??? + override def prod(x: Tensor[T], dim: Int): Tensor[T] = ??? + override def sum(dim: Int): Tensor[T] = ??? + override def sum(x: Tensor[T], dim: Int): Tensor[T] = ??? + override def mean(): T = ??? 
+ override def mean(dim: Int): Tensor[T] = ??? + override def max(): T = ??? + override def max(dim: Int): (Tensor[T], Tensor[T]) = ??? + override def max(values: Tensor[T], indices: Tensor[T], dim: Int): (Tensor[T], Tensor[T]) = ??? + override def min(): T = ??? + override def min(dim: Int): (Tensor[T], Tensor[T]) = ??? + override def min(values: Tensor[T], indices: Tensor[T], dim: Int): (Tensor[T], Tensor[T]) = ??? + override def scatter(dim: Int, index: Tensor[T], src: Tensor[T]): Tensor[T] = ??? + override def gather(dim: Int, index: Tensor[T], src: Tensor[T]): Tensor[T] = ??? + override def conv2(kernel: Tensor[T], vf: Char): Tensor[T] = ??? + override def xcorr2(kernel: Tensor[T], vf: Char): Tensor[T] = ??? + override def sqrt(): Tensor[T] = ??? + override def tanh(): Tensor[T] = ??? + override def abs(): Tensor[T] = ??? + override def add(value: T, y: Tensor[T]): Tensor[T] = ??? + override def add(y: Tensor[T]): Tensor[T] = ??? + override def add(x: Tensor[T], value: T, y: Tensor[T]): Tensor[T] = ??? + override def add(value: T): Tensor[T] = ??? + override def add(x: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def dot(y: Tensor[T]): T = ??? + override def cmax(value: T): Tensor[T] = ??? + override def dist(y: Tensor[T], norm: Int): T = ??? + override def addcmul(value: T, tensor1: Tensor[T], tensor2: Tensor[T]): Tensor[T] = ??? + override def addcmul(tensor1: Tensor[T], tensor2: Tensor[T]): Tensor[T] = ??? + override def addcdiv(value: T, tensor1: Tensor[T], tensor2: Tensor[T]): Tensor[T] = ??? + override def sub(value: T, y: Tensor[T]): Tensor[T] = ??? + override def sub(x: Tensor[T], value: T, y: Tensor[T]): Tensor[T] = ??? + override def sub(y: Tensor[T]): Tensor[T] = ??? + override def sub(x: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def sub(value: T): Tensor[T] = ??? + override def cmul(y: Tensor[T]): Tensor[T] = ??? + override def cmul(x: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def cdiv(y: Tensor[T]): Tensor[T] = ??? + override def cdiv(x: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def mul(value: T): Tensor[T] = ??? + override def div(value: T): Tensor[T] = ??? + override def div(y: Tensor[T]): Tensor[T] = ??? + override def mul(x: Tensor[T], value: T): Tensor[T] = ??? + override def addmm(v1: T, M: Tensor[T], v2: T, mat1: Tensor[T], mat2: Tensor[T]): Tensor[T] = ??? + override def addmm(M: Tensor[T], mat1: Tensor[T], mat2: Tensor[T]): Tensor[T] = ??? + override def addmm(mat1: Tensor[T], mat2: Tensor[T]): Tensor[T] = ??? + override def addmm(v2: T, mat1: Tensor[T], mat2: Tensor[T]): Tensor[T] = ??? + override def addmm(v1: T, v2: T, mat1: Tensor[T], mat2: Tensor[T]): Tensor[T] = ??? + override def mm(mat1: Tensor[T], mat2: Tensor[T]): Tensor[T] = ??? + override def addr(t1: Tensor[T], t2: Tensor[T]): Tensor[T] = ??? + override def addr(v1: T, t1: Tensor[T], t2: Tensor[T]): Tensor[T] = ??? + override def addr(v1: T, t1: Tensor[T], v2: T, t2: Tensor[T]): Tensor[T] = ??? + override def addr(v1: T, t1: Tensor[T], v2: T, t2: Tensor[T], t3: Tensor[T]): Tensor[T] = ??? + override def uniform(args: T*): T = ??? + override def addmv(beta: T, vec1: Tensor[T], alpha: T, mat: Tensor[T], vec2: Tensor[T]): Tensor[T] = ??? + override def addmv(beta: T, alpha: T, mat: Tensor[T], vec2: Tensor[T]): Tensor[T] = ??? + override def addmv(alpha: T, mat: Tensor[T], vec2: Tensor[T]): Tensor[T] = ??? + override def mv(mat: Tensor[T], vec2: Tensor[T]): Tensor[T] = ??? 
+ override def baddbmm(beta: T, M: Tensor[T], alpha: T, batch1: Tensor[T], batch2: Tensor[T]): Tensor[T] = ??? + override def baddbmm(beta: T, alpha: T, batch1: Tensor[T], batch2: Tensor[T]): Tensor[T] = ??? + override def baddbmm(alpha: T, batch1: Tensor[T], batch2: Tensor[T]): Tensor[T] = ??? + override def bmm(batch1: Tensor[T], batch2: Tensor[T]): Tensor[T] = ??? + override def pow(y: Tensor[T], n: T): Tensor[T] = ??? + override def pow(n: T): Tensor[T] = ??? + override def square(): Tensor[T] = ??? + override def floor(y: Tensor[T]): Tensor[T] = ??? + override def floor(): Tensor[T] = ??? + override def ceil(): Tensor[T] = ??? + override def inv(): Tensor[T] = ??? + override def erf(): Tensor[T] = ??? + override def erfc(): Tensor[T] = ??? + override def logGamma(): Tensor[T] = ??? + override def digamma(): Tensor[T] = ??? + override def topk(k: Int, dim: Int, increase: Boolean, result: Tensor[T], indices: Tensor[T], sortedResult: Boolean): (Tensor[T], Tensor[T]) = ??? + override def log(y: Tensor[T]): Tensor[T] = ??? + override def exp(y: Tensor[T]): Tensor[T] = ??? + override def sqrt(y: Tensor[T]): Tensor[T] = ??? + override def tanh(y: Tensor[T]): Tensor[T] = ??? + override def log1p(y: Tensor[T]): Tensor[T] = ??? + override def log(): Tensor[T] = ??? + override def exp(): Tensor[T] = ??? + override def log1p(): Tensor[T] = ??? + override def abs(x: Tensor[T]): Tensor[T] = ??? + override def norm(y: Tensor[T], value: Int, dim: Int): Tensor[T] = ??? + override def gt(x: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def lt(x: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def le(x: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def eq(x: Tensor[T], y: T): Tensor[T] = ??? + override def maskedFill(mask: Tensor[T], e: T): Tensor[T] = ??? + override def maskedCopy(mask: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def maskedSelect(mask: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def norm(value: Int): T = ??? + override def sign(): Tensor[T] = ??? + override def ge(x: Tensor[T], value: Double): Tensor[T] = ??? + override def indexAdd(dim: Int, index: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def index(dim: Int, index: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def cmax(y: Tensor[T]): Tensor[T] = ??? + override def cmin(y: Tensor[T]): Tensor[T] = ??? + override def cmax(x: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def cmin(x: Tensor[T], y: Tensor[T]): Tensor[T] = ??? + override def range(xmin: Double, xmax: Double, step: Int): Tensor[T] = ??? + override def negative(x: Tensor[T]): Tensor[T] = ??? + override def reduce(dim: Int, result: Tensor[T], reducer: (T, T) => T): Tensor[T] = ??? + override def sumSquare(): T = ??? + override def clamp(min: Double, max: Double): Tensor[T] = ??? + override def toTensor[D](implicit ev: TensorNumeric[D]): Tensor[D] = ??? + override private[bigdl] def toQuantizedTensor = ??? + // scalastyle: on + } +} diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/Tensor.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/Tensor.scala index 75f2bfe83d3..881752e5bbd 100644 --- a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/Tensor.scala +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/tensor/Tensor.scala @@ -837,6 +837,8 @@ object SparseType extends TensorType object QuantizedType extends TensorType +object MklDnnType extends TensorType + object Tensor { // pre-load MKL library. 
If we do not do it here, @@ -960,7 +962,8 @@ object Tensor { */ def apply[@specialized(Float, Double) T: ClassTag](storage: Storage[T])( implicit ev: TensorNumeric[T]): Tensor[T] = { - new DenseTensor(storage.asInstanceOf[Storage[T]]) + require(storage.isInstanceOf[ArrayStorage[_]], "Only array storage is supported in this operation") + new DenseTensor(storage.asInstanceOf[ArrayStorage[T]]) } /** @@ -1002,12 +1005,12 @@ object Tensor { * @tparam T * @return */ - def apply[@specialized(Float, Double) T: ClassTag](storage: Storage[T], - storageOffset: Int, - size: Array[Int] = null, - stride: Array[Int] = null) - (implicit ev: TensorNumeric[T]): Tensor[T] = { - new DenseTensor(storage.asInstanceOf[Storage[T]], storageOffset, size, stride) + def apply[@specialized(Float, Double) T: ClassTag]( + storage: Storage[T], + storageOffset: Int, + size: Array[Int] = null, + stride: Array[Int] = null)(implicit ev: TensorNumeric[T]): Tensor[T] = { + new DenseTensor(storage.asInstanceOf[ArrayStorage[T]], storageOffset, size, stride) } /** diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/utils/utils/ThreadPool.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/utils/utils/ThreadPool.scala index 1c48eef4619..5dc71e8151b 100644 --- a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/utils/utils/ThreadPool.scala +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/utils/utils/ThreadPool.scala @@ -18,7 +18,7 @@ package com.intel.analytics.bigdl.utils import java.util.concurrent._ -import com.intel.analytics.bigdl.mkl.MKL +import com.intel.analytics.bigdl.mkl.{MKL, MklDnn} import org.apache.commons.lang.exception.ExceptionUtils import org.apache.log4j.Logger diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/AvgPoolingSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/AvgPoolingSpec.scala new file mode 100644 index 00000000000..eb623adc959 --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/AvgPoolingSpec.scala @@ -0,0 +1,80 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.nn.SpatialAveragePooling +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.BigDLSpecHelper +import com.intel.analytics.bigdl.utils.RandomGenerator.RNG + +import scala.util.Random + +class AvgPoolingSpec extends BigDLSpecHelper { + "Avg Pooling test1" should "be correct" in { + val batchSize = 2 + val input = Tensor[Float](batchSize, 480, 28, 28).apply1(e => Random.nextFloat()) + + RNG.setSeed(100) + val pool = AvgPooling(3, 3, 2, 2) + RNG.setSeed(100) + val layer = SpatialAveragePooling[Float](3, 3, 2, 2).ceil() + + val output2 = layer.forward(input).toTensor[Float] + + val seq = Sequential() + seq.add(ReorderMemory(HeapData(Array(batchSize, 480, 28, 28), Memory.Format.nchw), + HeapData(Array(batchSize, 480, 28, 28), Memory.Format.nchw))) + seq.add(pool) + seq.add(ReorderMemory(HeapData(Array(batchSize, 480, 14, 14), Memory.Format.nchw), + HeapData(Array(batchSize, 480, 14, 14), Memory.Format.nchw))) + seq.compile(Phase.TrainingPhase, Array(HeapData(Array(batchSize, 480, 28, 28), + Memory.Format.nchw))) + val output1 = seq.forward(input) + output1 should be(output2) + + val grad2 = layer.backward(input, output2).toTensor[Float] + val grad1 = seq.backward(input, output2) + grad1 should be(grad2) + } + + "Avg Pooling test2" should "be correct" in { + val batchSize = 2 + val input = Tensor[Float](batchSize, 64, 112, 112).apply1(e => Random.nextFloat()) + + RNG.setSeed(100) + val pool = AvgPooling(3, 3, 2, 2) + RNG.setSeed(100) + val layer = SpatialAveragePooling[Float](3, 3, 2, 2).ceil() + + val output2 = layer.forward(input).toTensor[Float] + + val seq = Sequential() + seq.add(ReorderMemory(HeapData(Array(batchSize, 64, 112, 112), Memory.Format.nchw), + HeapData(Array(batchSize, 64, 112, 112), Memory.Format.nchw))) + seq.add(pool) + seq.add(ReorderMemory(HeapData(Array(batchSize, 64, 56, 56), Memory.Format.nchw), + HeapData(Array(batchSize, 64, 56, 56), Memory.Format.nchw))) + seq.compile(Phase.TrainingPhase, Array(HeapData(Array(batchSize, 64, 112, 112), + Memory.Format.nchw))) + val output1 = seq.forward(input) + output1 should be(output2) + + val grad2 = layer.backward(input, output2).toTensor[Float] + val grad1 = seq.backward(input, output2) + grad1 should be(grad2) + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/CAddTableSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/CAddTableSpec.scala new file mode 100644 index 00000000000..591b7f2475f --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/CAddTableSpec.scala @@ -0,0 +1,55 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
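Editor's aside on the shapes hard-coded in the two average-pooling tests above: the 28x28 -> 14x14 and 112x112 -> 56x56 output sizes follow from ordinary ceil-mode pooling arithmetic with kernel 3, stride 2 and no padding. The sketch below uses a hypothetical helper name (ceilOut) and is only meant for checking the ReorderMemory shapes; it is not code from this patch.

object PoolShapeSketch {
  // ceil-mode output size for one spatial dimension of a pooling window
  def ceilOut(in: Int, kernel: Int, stride: Int, pad: Int = 0): Int =
    math.ceil((in - kernel + 2 * pad).toDouble / stride).toInt + 1

  def main(args: Array[String]): Unit = {
    println(ceilOut(28, 3, 2))   // 14, matches the 14 x 14 ReorderMemory shape in test1
    println(ceilOut(112, 3, 2))  // 56, matches the 56 x 56 ReorderMemory shape in test2
  }
}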
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.{BigDLSpecHelper, T} + +class CAddTableSpec extends BigDLSpecHelper { + "CAddTable" should "be correct" in { + val layer = CAddTable() + val model = Sequential() + val concat = ConcatTable() + concat.add(ReorderMemory(HeapData(Array(2, 2), Memory.Format.nc), + NativeData(Array(2, 2), Memory.Format.nc), HeapData(Array(2, 2), Memory.Format.nc), + NativeData(Array(2, 2), Memory.Format.nc))) + concat.add(ReorderMemory(HeapData(Array(2, 2), Memory.Format.nc), + NativeData(Array(2, 2), Memory.Format.nc), HeapData(Array(2, 2), Memory.Format.nc), + NativeData(Array(2, 2), Memory.Format.nc))) + model.add(concat) + model.add(layer) + model.add(ReorderMemory(NativeData(Array(2, 2), Memory.Format.nc), + HeapData(Array(2, 2), Memory.Format.nc), NativeData(Array(2, 2), Memory.Format.nc), + HeapData(Array(2, 2), Memory.Format.nc))) + model.compile(Phase.TrainingPhase, Array(HeapData(Array(2, 2), Memory.Format.nc))) + model.forward(Tensor[Float](T(T(1, 2), T(3, 4)))) should be(Tensor[Float](T( + T(2, 4), + T(6, 8) + ))) + val dnnGrad = model.backward(Tensor[Float](T(T(1, 2), T(3, 4))), T( + Tensor[Float](T( + T(4, 5), + T(6, 7) + )) + )).asInstanceOf[Tensor[Float]] + val heapGrad = Tensor[Float](2, 2) + heapGrad.copy(dnnGrad) + heapGrad should be ( + Tensor[Float](T(T(8, 10), T(12, 14))) + ) + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/ConcatTableSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/ConcatTableSpec.scala new file mode 100644 index 00000000000..ebebe17b089 --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/ConcatTableSpec.scala @@ -0,0 +1,72 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
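Editor's aside on the literals asserted in CAddTableSpec above: the ConcatTable feeds two identity-style reorder branches into CAddTable, so the forward output is simply twice the input, and in backward the single gradOutput reaches both branches and is summed again, so the gradInput is twice the gradOutput. A tiny standalone sketch of that expectation (hypothetical object name, not part of the patch):

object CAddTableExpectation {
  // doubling is all the test's expected tensors encode
  def double(m: Array[Array[Float]]): Array[Array[Float]] = m.map(_.map(_ * 2))

  def main(args: Array[String]): Unit = {
    val input = Array(Array(1f, 2f), Array(3f, 4f))
    val gradOutput = Array(Array(4f, 5f), Array(6f, 7f))
    println(double(input).map(_.mkString(" ")).mkString(" | "))      // 2.0 4.0 | 6.0 8.0
    println(double(gradOutput).map(_.mkString(" ")).mkString(" | ")) // 8.0 10.0 | 12.0 14.0
  }
}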
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.{BigDLSpecHelper, T} + +class ConcatTableSpec extends BigDLSpecHelper { + "ConcatTable" should "throw exception when input shape is different" in { + val container = ConcatTable() + container.add(Input(Array(1, 2, 3, 4), Memory.Format.nchw)) + container.add(Input(Array(1, 3, 4, 2), Memory.Format.nchw)) + + intercept[IllegalArgumentException] { + container.compile(Phase.TrainingPhase, Array(HeapData(Array(1, 2, 3, 4), Memory.Format.nchw))) + } + } + + "ConcatTable" should "be good" in { + val container = ConcatTable() + container.add(ReorderMemory( + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc))) + val subcontainer = Sequential() + subcontainer.add(ReorderMemory( + HeapData(Array(3, 4), Memory.Format.nc), + NativeData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc), + NativeData(Array(3, 4), Memory.Format.nc))) + subcontainer.add(ReorderMemory(NativeData(Array(3, 4), Memory.Format.io), + NativeData(Array(3, 4), Memory.Format.nc))) + subcontainer.add(ReorderMemory(HeapData(Array(3, 4), Memory.Format.nc), + NativeData(Array(3, 4), Memory.Format.io))) + container.add(subcontainer) + + container.compile(Phase.TrainingPhase, Array(HeapData(Array(3, 4), Memory.Format.nc))) + val input1 = Tensor[Float](3, 4).rand() + val output1 = container.forward(input1).toTable + output1(1).asInstanceOf[Tensor[Float]] should be(input1) + output1(2).asInstanceOf[Tensor[Float]] should be(input1) + + val grad1 = Tensor[Float](3, 4).rand() + val nativeGrad = container.backward(input1, T(grad1, grad1)).asInstanceOf[Tensor[Float]] + val heapGrad = Tensor[Float](3, 4).copy(nativeGrad) + heapGrad should be(grad1 * 2) + val input2 = Tensor[Float](3, 4).rand() + val output2 = container.forward(input2).toTable + output2(1).asInstanceOf[Tensor[Float]] should be(input2) + output2(2).asInstanceOf[Tensor[Float]] should be(input2) + + val grad2 = Tensor[Float](3, 4).rand() + val nativeGrad2 = container.backward(input1, T(grad2, grad2)).asInstanceOf[Tensor[Float]] + val heapGrad2 = Tensor[Float](3, 4).copy(nativeGrad2) + heapGrad2 should be(grad2 * 2) + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/FusionSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/FusionSpec.scala new file mode 100644 index 00000000000..062e87fee2a --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/FusionSpec.scala @@ -0,0 +1,155 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.nn.mkldnn.Phase.InferencePhase +import org.scalatest.{FlatSpec, Matchers} +import com.intel.analytics.bigdl.numeric.NumericFloat +import com.intel.analytics.bigdl.tensor.Tensor + +class FusionSpec extends FlatSpec with Matchers { + "Conv with relu" should "work correctly" in { + val batchSize = 2 + val input = Tensor[Float](batchSize, 3, 224, 224).fill(1.0f) + + val inputShape = Array(batchSize, 3, 224, 224) + val outputShape = Array(batchSize, 64, 112, 112) + + val conv1 = SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3, 1, false) + val reorder1 = ReorderMemory(HeapData(inputShape, Memory.Format.nchw)) + val reorder11 = ReorderMemory(HeapData(outputShape, Memory.Format.nchw)) + val model1 = Sequential().add(reorder1).add(conv1).add(ReLU()).add(reorder11) + model1.compile(InferencePhase, Array(HeapData(inputShape, Memory.Format.nchw))) + + System.setProperty("bigdl.mkldnn.fusion.convrelu", "true") + val conv2 = SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3, 1, false, initWeight = conv1.weight, + initBias = conv1.bias) + val reorder2 = ReorderMemory(HeapData(inputShape, Memory.Format.nchw)) + val reorder22 = ReorderMemory(HeapData(outputShape, Memory.Format.nchw)) + val model2 = Sequential().add(reorder2).add(conv2).add(ReLU()).add(reorder22) + model2.compile(InferencePhase, Array(HeapData(inputShape, Memory.Format.nchw))) + System.setProperty("bigdl.mkldnn.fusion.convrelu", "false") + + model1.evaluate() + model2.evaluate() + + model1.forward(input) + model2.forward(input) + + model1.output should be (model2.output) + model1.modules.length should be (model2.modules.length + 1) + } + + "Conv Bn merge" should "work correctly" in { + val batchSize = 4 + val inputShape = Array(batchSize, 3, 224, 224) + val outputShape = Array(batchSize, 64, 112, 112) + val input = Tensor[Float](batchSize, 3, 224, 224).fill(1.0f) + + val runningMean = Tensor[Float](64).rand(-1, 1) + val runningVar = Tensor[Float](64).fill(100) + val initWeight = Tensor[Float]().resize(Array(64, 3, 7, 7)).rand(-1, 1) + val initBias = Tensor[Float]().resize(Array(64)).rand(-100, 100) + + val conv1 = SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3, 1, false, initWeight = initWeight, + initBias = initBias) + val bn1 = SpatialBatchNormalization(64, eps = 0.0) + bn1.runningMean.copy(runningMean) + bn1.runningVariance.copy(runningVar) + val reorder1 = ReorderMemory(HeapData(inputShape, Memory.Format.nchw)) + val reorder11 = ReorderMemory(HeapData(outputShape, Memory.Format.nchw)) + val model1 = Sequential().add(reorder1).add(conv1).add(bn1).add(reorder11) + model1.compile(InferencePhase, Array(HeapData(inputShape, Memory.Format.nchw))) + + System.setProperty("bigdl.mkldnn.fusion.convbn", "true") + val conv2 = SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3, 1, false, initWeight = conv1.weight, + initBias = conv1.bias) + val bn2 = SpatialBatchNormalization(64, eps = 0.0) + bn2.runningMean.copy(runningMean) + bn2.runningVariance.copy(runningVar) + val reorder2 = ReorderMemory(HeapData(inputShape, Memory.Format.nchw)) + val reorder22 = ReorderMemory(HeapData(outputShape, Memory.Format.nchw)) + val model2 = Sequential().add(reorder2).add(conv2).add(bn2).add(reorder22) + model2.compile(InferencePhase, Array(HeapData(inputShape, Memory.Format.nchw))) + System.setProperty("bigdl.mkldnn.fusion.convbn", "false") + + model1.evaluate() + model2.evaluate() + + model1.forward(input) + model2.forward(input) + + 
Equivalent.nearequals(model1.output.toTensor, model2.output.toTensor, 1e-5) should be (true) + model1.modules.length should be (model2.modules.length + 1) + } + + "Conv sum fusion" should "work correctly" in { + import com.intel.analytics.bigdl.numeric.NumericFloat + + val input = Tensor[Float](2, 1, 6, 6).rand(-1, 1) + val inputShape = Array(2, 1, 6, 6) + val outputShape = Array(2, 3, 4, 4) + + val initWeight = Tensor[Float](3, 1, 2, 2).fill(1) + val initBias = Tensor[Float](3).fill(0) + + val conv1 = SpatialConvolution(1, 3, 2, 2, 2, 2, 1, 1, 1, initWeight = initWeight, + initBias = initBias) + val conv2 = SpatialConvolution(1, 3, 2, 2, 2, 2, 1, 1, 1, initWeight = initWeight, + initBias = initBias) + val conv3 = SpatialConvolution(1, 3, 2, 2, 2, 2, 1, 1, 1, initWeight = initWeight, + initBias = initBias) + val conv4 = SpatialConvolution(1, 3, 2, 2, 2, 2, 1, 1, 1, initWeight = initWeight, + initBias = initBias) + + val reorder1 = ReorderMemory(HeapData(outputShape, Memory.Format.nchw)) + val reorder2 = ReorderMemory(HeapData(outputShape, Memory.Format.nchw)) + + val model1 = Sequential() + .add(ConcatTable() + .add(conv1) + .add(conv2)) + .add(CAddTable()) + .add(ReLU()) + .add(reorder1) + model1.compile(InferencePhase, Array(HeapData(inputShape, Memory.Format.nchw))) + + System.setProperty("bigdl.mkldnn.fusion.convsum", "true") + System.setProperty("bigdl.mkldnn.fusion.convrelu", "true") + val model2 = Sequential() + .add(ConcatTable() + .add(conv3) + .add(conv4)) + .add(CAddTable()) + .add(ReLU()) + .add(reorder2) + + model1.evaluate() + model2.evaluate() + + model2.compile(InferencePhase, Array(HeapData(inputShape, Memory.Format.nchw))) + System.setProperty("bigdl.mkldnn.fusion.convsum", "false") + System.setProperty("bigdl.mkldnn.fusion.convrelu", "false") + + model1.forward(input) + model2.forward(input) + + model1.output should be (model2.output) + } + +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/InputSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/InputSpec.scala new file mode 100644 index 00000000000..a7921362884 --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/InputSpec.scala @@ -0,0 +1,33 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
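Editor's aside on the "Conv Bn merge" test above: with bigdl.mkldnn.fusion.convbn enabled, the batch normalization is expected to be folded into the preceding convolution at inference time, which is why model2 ends up one module shorter yet matches model1 within 1e-5. The sketch below shows the usual per-channel folding identity, assuming the fused kernel follows it; the names are hypothetical and this is not code from the patch.

object ConvBnFoldSketch {
  // For one output channel: s = gamma / sqrt(var + eps), then W' = s * W and
  // b' = s * (b - mean) + beta give the folded convolution.
  def fold(gamma: Float, beta: Float, mean: Float, variance: Float,
           eps: Float, bias: Float): (Float, Float) = {
    val s = gamma / math.sqrt(variance + eps).toFloat
    (s, s * (bias - mean) + beta)
  }

  def main(args: Array[String]): Unit = {
    // With the test's runningVariance filled with 100 and eps = 0, every channel's
    // weights are scaled by gamma / 10.
    val (s, b) = fold(gamma = 1f, beta = 0f, mean = 0.5f, variance = 100f, eps = 0f, bias = 2f)
    println(s)  // 0.1
    println(b)  // 0.15
  }
}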
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.BigDLSpecHelper + +class InputSpec extends BigDLSpecHelper { + "Input" should "be correct" in { + val layer = Input(Array(2, 2), Memory.Format.nc) + layer.setRuntime(new MklDnnRuntime()) + layer.initFwdPrimitives(Array(), Phase.TrainingPhase) + layer.initBwdPrimitives(Array(), Phase.TrainingPhase) + val tensor = Tensor[Float](2, 2).rand() + val grad = Tensor[Float](2, 2).rand() + layer.forward(tensor) should be(tensor) + layer.backward(tensor, grad) should be(grad) + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/JoinTableSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/JoinTableSpec.scala new file mode 100644 index 00000000000..7d4b7c8162d --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/JoinTableSpec.scala @@ -0,0 +1,59 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.{BigDLSpecHelper, T} + +class JoinTableSpec extends BigDLSpecHelper { + "Join table" should "work correctly" in { + val layer = JoinTable(1) + val model = Sequential() + val concat = ConcatTable() + concat.add(ReorderMemory(HeapData(Array(2, 2), Memory.Format.nc), + NativeData(Array(2, 2), Memory.Format.nc), HeapData(Array(2, 2), Memory.Format.nc), + NativeData(Array(2, 2), Memory.Format.nc))) + concat.add(ReorderMemory(HeapData(Array(2, 2), Memory.Format.nc), + NativeData(Array(2, 2), Memory.Format.nc), HeapData(Array(2, 2), Memory.Format.nc), + NativeData(Array(2, 2), Memory.Format.nc))) + model.add(concat) + model.add(layer) + model.add(ReorderMemory(NativeData(Array(4, 2), Memory.Format.nc), + HeapData(Array(4, 2), Memory.Format.nc), NativeData(Array(4, 2), Memory.Format.nc), + HeapData(Array(4, 2), Memory.Format.nc))) + model.compile(Phase.TrainingPhase, Array(HeapData(Array(2, 2), Memory.Format.nc))) + model.forward(Tensor[Float](T(T(1, 2), T(3, 4)))) should be(Tensor[Float](T( + T(1, 2), + T(3, 4), + T(1, 2), + T(3, 4) + ))) + val dnnGrad = model.backward(Tensor[Float](T(T(1, 2), T(3, 4))), T( + Tensor[Float](T( + T(4, 5), + T(6, 7), + T(1, 3), + T(4, 2) + )) + )).asInstanceOf[Tensor[Float]] + val heapGrad = Tensor[Float](2, 2) + heapGrad.copy(dnnGrad) + heapGrad should be( + Tensor[Float](T(T(5, 8), T(10, 9))) + ) + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/LRNSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/LRNSpec.scala new file mode 100644 index 00000000000..24208d39765 --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/LRNSpec.scala @@ -0,0 +1,53 @@ +/* + * Copyright 2016 The BigDL Authors. 
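Editor's aside on the JoinTable assertion above: JoinTable(1) stacks the two 2x2 branch outputs into a 4x2 tensor, so in backward the 4x2 gradOutput is split back into two 2x2 blocks and the ConcatTable sums them, giving (4,5)+(1,3) = (5,8) and (6,7)+(4,2) = (10,9). A standalone sketch of that arithmetic (hypothetical object name, not part of the patch):

object JoinTableExpectation {
  def main(args: Array[String]): Unit = {
    val grad = Array(Array(4f, 5f), Array(6f, 7f), Array(1f, 3f), Array(4f, 2f))
    val (top, bottom) = grad.splitAt(2)                       // undo the dim-1 concatenation
    val summed = top.zip(bottom).map { case (a, b) => a.zip(b).map { case (x, y) => x + y } }
    println(summed.map(_.mkString(" ")).mkString(" | "))      // 5.0 8.0 | 10.0 9.0
  }
}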
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.nn.SpatialCrossMapLRN +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.BigDLSpecHelper +import com.intel.analytics.bigdl.utils.RandomGenerator.RNG + +import scala.util.Random + +class LRNSpec extends BigDLSpecHelper { + "LRNDnn with format=nchw" should "work correctly" in { + val batchSize = 2 + val input = Tensor[Float](batchSize, 7, 3, 3).apply1(e => Random.nextFloat()) + val gradOutput = Tensor[Float](batchSize, 7, 3, 3).apply1(e => Random.nextFloat()) + + RNG.setSeed(100) + val lrnDnn = LRN(5, 0.0001, 0.75, 1.0) + RNG.setSeed(100) + val lrnBLAS = SpatialCrossMapLRN[Float](5, 0.0001, 0.75, 1.0) + + val output2 = lrnBLAS.forward(input) + val grad2 = lrnBLAS.updateGradInput(input, gradOutput) + + val seq = Sequential() + seq.add(ReorderMemory(HeapData(Array(batchSize, 7, 3, 3), Memory.Format.nchw), + HeapData(Array(batchSize, 7, 3, 3), Memory.Format.nchw))) + seq.add(lrnDnn) + seq.add(ReorderMemory(HeapData(Array(batchSize, 7, 3, 3), Memory.Format.nchw), + HeapData(Array(batchSize, 7, 3, 3), Memory.Format.nchw))) + seq.compile(Phase.TrainingPhase, Array(HeapData(Array(batchSize, 7, 3, 3), Memory.Format.nchw))) + val output = seq.forward(input) + output.asInstanceOf[Tensor[Float]] should be(output2) + val grad1 = seq.backward(input, gradOutput) + grad1.asInstanceOf[Tensor[Float]] should be(grad2) + } + +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/LinearSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/LinearSpec.scala new file mode 100644 index 00000000000..39668c2945f --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/LinearSpec.scala @@ -0,0 +1,365 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
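Editor's aside on LRNSpec above: both LRN(5, 0.0001, 0.75, 1.0) and SpatialCrossMapLRN are expected to implement the usual cross-channel local response normalization, b(c) = a(c) / (k + (alpha / size) * sum of a(c')^2 over a window of size channels)^beta. Implementations differ in details such as whether alpha is pre-divided by the window size, so the sketch below only illustrates the shape of the computation; names are hypothetical and this is not code from the patch.

object LrnSketch {
  // cross-map LRN for a single spatial position, given the values across channels
  def lrnAt(a: Array[Float], c: Int, size: Int, alpha: Float, beta: Float, k: Float): Float = {
    val half = size / 2
    val lo = math.max(0, c - half)
    val hi = math.min(a.length - 1, c + half)
    val sumSq = (lo to hi).map(i => a(i) * a(i)).sum
    (a(c) / math.pow(k + (alpha / size) * sumSq, beta)).toFloat
  }

  def main(args: Array[String]): Unit = {
    val channels = Array(0.5f, -0.2f, 0.1f, 0.7f, -0.4f, 0.3f, 0.9f)
    println(lrnAt(channels, c = 3, size = 5, alpha = 1e-4f, beta = 0.75f, k = 1.0f))
  }
}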
+ */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.nn +import com.intel.analytics.bigdl.nn.mkldnn.Phase.TrainingPhase +import com.intel.analytics.bigdl.numeric.NumericFloat +import com.intel.analytics.bigdl.tensor.Tensor +import org.scalatest.{FlatSpec, Matchers} + +class LinearSpec extends FlatSpec with Matchers { + "linear updateOutput" should "work correctly" in { + val inputSize = 2 + val outputSize = 2 + val batchSize = 2 + + val inputFormat = HeapData(Array(batchSize, inputSize), Memory.Format.nc) + val outputFormat = HeapData(Array(batchSize, outputSize), Memory.Format.nc) + val input = Tensor[Float](batchSize, inputSize).rand() + + val initWeight = Tensor[Float](outputSize, inputSize).rand() + val initBias = Tensor[Float](outputSize).rand() + + val linear = Linear(inputSize, outputSize, initWeight = initWeight, initBias = initBias) + linear.setRuntime(new MklDnnRuntime) + linear.initFwdPrimitives(Array(inputFormat), TrainingPhase) + linear.initBwdPrimitives(Array(outputFormat), TrainingPhase) + linear.initGradWPrimitives(Array(outputFormat), TrainingPhase) + + val output = linear.forward(input) + println(output) + + val nnLinear = nn.Linear(inputSize, outputSize, initWeight = initWeight, initBias = initBias) + val nnOutput = nnLinear.forward(input) + println(nnOutput) + + Tools.dense(output) should be (nnOutput) + } + + "linear updateOutput multi times" should "work correctly" in { + val inputSize = 2 + val outputSize = 2 + val batchSize = 2 + + val inputFormat = HeapData(Array(batchSize, inputSize), Memory.Format.nc) + val outputFormat = HeapData(Array(batchSize, outputSize), Memory.Format.nc) + + val initWeight = Tensor[Float](outputSize, inputSize).rand() + val initBias = Tensor[Float](outputSize).rand() + + val inputs = new Array[Tensor[Float]](100) + for (i <- inputs.indices) { + inputs(i) = Tensor[Float](batchSize, inputSize).rand() + } + + val linear = Linear(inputSize, outputSize, initWeight = initWeight, initBias = initBias) + linear.setRuntime(new MklDnnRuntime) + linear.initFwdPrimitives(Array(inputFormat), TrainingPhase) + linear.initBwdPrimitives(Array(outputFormat), TrainingPhase) + linear.initGradWPrimitives(Array(outputFormat), TrainingPhase) + + for (in <- inputs) { + linear.forward(in) + } + println(linear.output) + + val nnLinear = nn.Linear(inputSize, outputSize, initWeight = initWeight, initBias = initBias) + for (in <- inputs) { + nnLinear.forward(in) + } + println(nnLinear.output) + + Tools.dense(linear.output) should be (nnLinear.output) + } + + "linear updateGradInput" should "work correctly" in { + val inputSize = 2 + val outputSize = 2 + val batchSize = 2 + + val inputFormat = HeapData(Array(batchSize, inputSize), Memory.Format.nc) + val outputFormat = HeapData(Array(batchSize, outputSize), Memory.Format.nc) + val input = Tensor[Float](batchSize, inputSize).rand() + val gradOutput = Tensor().resize(outputFormat.shape).rand() + + val initWeight = Tensor[Float](outputSize, inputSize).rand() + val initBias = Tensor[Float](outputSize).rand() + + val linear = Linear(inputSize, outputSize, initWeight = initWeight, initBias = initBias) + linear.setRuntime(new MklDnnRuntime) + linear.initFwdPrimitives(Array(inputFormat), TrainingPhase) + linear.initBwdPrimitives(Array(outputFormat), TrainingPhase) + linear.initGradWPrimitives(Array(outputFormat), TrainingPhase) + + val output = linear.forward(input) + val gradInput = linear.updateGradInput(input, gradOutput) + + val nnLinear = 
nn.Linear(inputSize, outputSize, initWeight = initWeight, initBias = initBias) + val nnOutput = nnLinear.forward(input) + val nnGradInput = nnLinear.updateGradInput(input, gradOutput) + + println(gradInput) + println("-" * 80) + println(nnGradInput) + + Tools.dense(gradInput) should be (nnGradInput) + } + + "linear updateGradInput multi times" should "work correctly" in { + val inputSize = 2 + val outputSize = 2 + val batchSize = 2 + + val inputFormat = HeapData(Array(batchSize, inputSize), Memory.Format.nc) + val outputFormat = HeapData(Array(batchSize, outputSize), Memory.Format.nc) + + val initWeight = Tensor[Float](outputSize, inputSize).rand() + val initBias = Tensor[Float](outputSize).rand() + + val inputs = new Array[Tensor[Float]](100) + for (i <- inputs.indices) { + inputs(i) = Tensor[Float](batchSize, inputSize).rand() + } + + val linear = Linear(inputSize, outputSize, initWeight = initWeight, initBias = initBias) + linear.setRuntime(new MklDnnRuntime) + linear.initFwdPrimitives(Array(inputFormat), TrainingPhase) + linear.initBwdPrimitives(Array(outputFormat), TrainingPhase) + linear.initGradWPrimitives(Array(outputFormat), TrainingPhase) + + for (i <- inputs.indices) { + inputs(i) = Tensor[Float](batchSize, inputSize).rand() + } + + val gradOutputs = new Array[Tensor[Float]](100) + for (i <- gradOutputs.indices) { + gradOutputs(i) = Tensor[Float](batchSize, outputSize).rand() + } + + linear.forward(inputs.last) + + for (i <- inputs.indices) { + linear.updateGradInput(inputs(i), gradOutputs(i)) + } + + val nnLinear = nn.Linear(inputSize, outputSize, initWeight = initWeight, initBias = initBias) + val nnOutput = nnLinear.forward(inputs.last) + + for (i <- inputs.indices) { + nnLinear.updateGradInput(inputs(i), gradOutputs(i)) + } + + Tools.dense(linear.gradInput) should be (nnLinear.gradInput) + } + + "linear accGradParameters" should "work correctly" in { + val inputSize = 2 + val outputSize = 2 + val batchSize = 2 + + val inputFormat = HeapData(Array(batchSize, inputSize), Memory.Format.nc) + val outputFormat = HeapData(Array(batchSize, outputSize), Memory.Format.nc) + val input = Tensor[Float](batchSize, inputSize).rand() + val gradOutput = Tensor[Float]().resize(outputFormat.shape).rand() + + val initWeight = Tensor[Float](outputSize, inputSize).rand() + val initBias = Tensor[Float](outputSize).rand() + + val linear = Linear(inputSize, outputSize, initWeight = initWeight, initBias = initBias) + linear.setRuntime(new MklDnnRuntime) + linear.initFwdPrimitives(Array(inputFormat), TrainingPhase) + linear.initBwdPrimitives(Array(outputFormat), TrainingPhase) + linear.initGradWPrimitives(Array(outputFormat), TrainingPhase) + + val output = linear.forward(input) + + val gradInput = linear.updateGradInput(input, gradOutput) + + val nnLinear = nn.Linear(inputSize, outputSize, initWeight = initWeight, initBias = initBias) + val nnOutput = nnLinear.forward(input) + val nnGradInput = nnLinear.updateGradInput(input, gradOutput) + + linear.accGradParameters(input, gradOutput) + nnLinear.accGradParameters(input, gradOutput) + + println(linear.gradWeight) + println(linear.gradBias) + println("-" * 80) + println(nnLinear.gradWeight) + println(nnLinear.gradBias) + + Tools.dense(linear.gradWeight) should be (nnLinear.gradWeight) + Tools.dense(linear.gradBias) should be (nnLinear.gradBias) + } + + "linear with maxpooling" should "work correctly" in { + val initWeight = Tensor[Float](4096, 256 * 6 * 6).rand() + val initBias = Tensor[Float](4096).rand() + val input = Tensor[Float](4, 256, 13, 
13).rand() + + val dnn = Sequential() + .add(MaxPooling(3, 3, 2, 2)) + .add(Linear(256 * 6 * 6, 4096, initWeight = initWeight, initBias = initBias)) + .add(ReorderMemory(HeapData(Array(4, 4096), Memory.Format.nc))) + dnn.compile(TrainingPhase, Array(HeapData(input.size(), Memory.Format.nchw))) + + val blas = nn.Sequential() + .add(nn.SpatialMaxPooling(3, 3, 2, 2)) + .add(nn.View(256 * 6 * 6)) + .add(nn.Linear(256 * 6 * 6, 4096, initWeight = initWeight, initBias = initBias)) + + blas.forward(input) + dnn.forward(input) + + val gradOutput = Tensor[Float]().resizeAs(blas.output.toTensor).rand() + dnn.backward(input, gradOutput) + blas.backward(input, gradOutput) + + Tools.dense(dnn.output) should be (blas.output) + Tools.dense(dnn.gradInput) should be (blas.gradInput) + } + +// "relu + linear with 1-D" should "work correctly" in { +// val initWeight = Tensor(10, 20).rand(-1, 1) +// val initBias = Tensor(10).rand(-1, 1) +// +// val input = Tensor(20).rand() +// val inputFormat = HeapData(Array(20), Memory.Format.x) +// val outputFormat = HeapData(Array(10), Memory.Format.x) +// +// val dnn = Sequential().add(ReLU()).add(Linear(20, 10, initWeight = initWeight, +// initBias = initBias)) +// dnn.compile(TrainingPhase, Array(inputFormat)) +// +// val blas = nn.Sequential().add(nn.ReLU()).add(nn.Linear(20, 10, initWeight = initWeight, +// initBias = initBias)) +// +// dnn.forward(input) +// println("=" * 80) +// blas.forward(input) +// +// val gradOutput = Tensor().resizeAs(blas.output.toTensor) +// dnn.backward(input, gradOutput) +// blas.backward(input, gradOutput) +// } + +// "1-D input" should "work correctly" in { +// val input = Tensor(20).rand() +// val gradOutput = Tensor(10).rand() +// +// val model = Linear(20, 10) +// model.setRuntime(new MklDnnRuntime) +// model.initFwdPrimitives(Array(HeapData(Array(20), Memory.Format.x)), TrainingPhase) +// model.initBwdPrimitives(Array(HeapData(Array(10), Memory.Format.x)), TrainingPhase) +// model.initGradWPrimitives(Array(HeapData(Array(10), Memory.Format.x)), TrainingPhase) +// +// model.forward(input) +// +// model.updateGradInput(input, gradOutput) +// } + + "linear + linear, the first linear with a 4-D input" should "work correctly" in { + val inputSize = 16 * 16 * 16 + val outputSize = 16 * 16 * 16 + val initWeight = Tensor[Float](outputSize, inputSize).rand() + val initBias = Tensor[Float](outputSize) + + val input = Tensor[Float](16, inputSize).rand() + val input2 = Tensor[Float](16, 16, 16, 16).rand() + + val inputShape1 = Array(16, inputSize) + val inputShape2 = Array(16, 16, 16, 16) + + val seq = Sequential() + .add(Linear(outputSize, inputSize, initWeight = initWeight, initBias = initBias)) + .add(Linear(outputSize, inputSize, initWeight = initWeight, initBias = initBias)) + + seq.compile(TrainingPhase, Array(HeapData(inputShape1, Memory.Format.nc))) + + val seq2 = Sequential() + .add(Linear(outputSize, inputSize, initWeight = initWeight, initBias = initBias)) + .add(Linear(outputSize, inputSize, initWeight = initWeight, initBias = initBias)) + + seq.compile(TrainingPhase, Array(HeapData(inputShape2, Memory.Format.nchw))) + + seq.forward(input) + seq.backward(input, input) + + seq.forward(input2) + seq.backward(input2, input) + } + + + "linear " should "work correctly" ignore { + val (batchSize, nInput) = (4, 64) + val inputShape = Array(batchSize, nInput) + val nOutput = 1000 + val outputShape = Array(batchSize, nOutput) + val name = "fc" + + val prototxt = + s""" + |name: "relu-simple" + |force_backward: true + |layer { + | name: 
"data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { ${shape2Dim(inputShape)} } + | } + |} + | + |layer { + | bottom: "data" + | top: "$name" + | name: "$name" + | type: "InnerProduct" + | inner_product_param { + | num_output: $nOutput + | weight_filler { + | type: "gaussian" + | std: 0.01 + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + """.stripMargin + val linear = new Linear(nInput, nOutput).setName(name) + linear.setRuntime(new MklDnnRuntime) + linear.initFwdPrimitives(Array(HeapData(inputShape, Memory.Format.nc)), TrainingPhase) + linear.initBwdPrimitives(Array(HeapData(outputShape, Memory.Format.nc)), TrainingPhase) + linear.initGradWPrimitives(Array(HeapData(outputShape, Memory.Format.nc)), TrainingPhase) + + Tools.compare(prototxt, linear, inputShape, outputShape) + } + + private def shape2Dim(shape: Array[Int]): String = { + shape.map(x => "dim: " + x).mkString(" ") + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/MaxPoolingSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/MaxPoolingSpec.scala new file mode 100644 index 00000000000..2511044b297 --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/MaxPoolingSpec.scala @@ -0,0 +1,80 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{AlgKind, Memory} +import com.intel.analytics.bigdl.nn.{SpatialAveragePooling, SpatialMaxPooling} +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.BigDLSpecHelper +import com.intel.analytics.bigdl.utils.RandomGenerator.RNG + +import scala.util.Random + +class MaxPoolingSpec extends BigDLSpecHelper { + "Max Pooling test1" should "be correct" in { + val batchSize = 2 + val input = Tensor[Float](batchSize, 480, 28, 28).apply1(e => Random.nextFloat()) + + RNG.setSeed(100) + val pool = MaxPooling(3, 3, 2, 2) + RNG.setSeed(100) + val layer = SpatialMaxPooling[Float](3, 3, 2, 2).ceil() + + val output2 = layer.forward(input).toTensor[Float] + + val seq = Sequential() + seq.add(ReorderMemory(HeapData(Array(batchSize, 480, 28, 28), Memory.Format.nchw), + HeapData(Array(batchSize, 480, 28, 28), Memory.Format.nchw))) + seq.add(pool) + seq.add(ReorderMemory(HeapData(Array(batchSize, 480, 14, 14), Memory.Format.nchw), + HeapData(Array(batchSize, 480, 14, 14), Memory.Format.nchw))) + seq.compile(Phase.TrainingPhase, Array(HeapData(Array(batchSize, 480, 28, 28), + Memory.Format.nchw))) + val output1 = seq.forward(input) + output1 should be(output2) + + val grad2 = layer.backward(input, output2).toTensor[Float] + val grad1 = seq.backward(input, output2) + grad1 should be(grad2) + } + + "Max Pooling test2" should "be correct" in { + val batchSize = 2 + val input = Tensor[Float](batchSize, 64, 112, 112).apply1(e => Random.nextFloat()) + + RNG.setSeed(100) + val pool = MaxPooling(3, 3, 2, 2) + RNG.setSeed(100) + val layer = SpatialMaxPooling[Float](3, 3, 2, 2).ceil() + + val output2 = layer.forward(input).toTensor[Float] + + val seq = Sequential() + seq.add(ReorderMemory(HeapData(Array(batchSize, 64, 112, 112), Memory.Format.nchw), + HeapData(Array(batchSize, 64, 112, 112), Memory.Format.nchw))) + seq.add(pool) + seq.add(ReorderMemory(HeapData(Array(batchSize, 64, 56, 56), Memory.Format.nchw), + HeapData(Array(batchSize, 64, 56, 56), Memory.Format.nchw))) + seq.compile(Phase.TrainingPhase, Array(HeapData(Array(batchSize, 64, 112, 112), + Memory.Format.nchw))) + val output1 = seq.forward(input) + output1 should be(output2) + + val grad2 = layer.backward(input, output2).toTensor[Float] + val grad1 = seq.backward(input, output2) + grad1 should be(grad2) + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/ReLUSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/ReLUSpec.scala new file mode 100644 index 00000000000..349f8c719ce --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/ReLUSpec.scala @@ -0,0 +1,76 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.nn +import com.intel.analytics.bigdl.nn.mkldnn.Phase.TrainingPhase +import com.intel.analytics.bigdl.numeric.NumericFloat +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.T +import org.scalatest.{FlatSpec, Matchers} + +class ReLUSpec extends FlatSpec with Matchers { + "a simple relu" should "be correct" in { + val layer = ReLU(0.0f) + val input = Tensor[Float](T( + T(1.0, 2.0), + T(-1.0, -2.0) + )) + val seq = Sequential() + seq.add(ReorderMemory(HeapData(Array(2, 2), Memory.Format.nc), + HeapData(Array(2, 2), Memory.Format.nc))) + seq.add(layer) + seq.add(ReorderMemory(HeapData(Array(2, 2), Memory.Format.nc), + HeapData(Array(2, 2), Memory.Format.nc))) + seq.compile(Phase.TrainingPhase, Array(HeapData(Array(2, 2), Memory.Format.nc))) + seq.forward(input) should be(Tensor[Float](T( + T(1.0, 2.0), + T(0.0, 0.0) + ))) + val grad = Tensor[Float](T( + T(-1.0, -2.0), + T(1.0, 2.0) + )) + seq.backward(input, grad) should be(Tensor[Float](T( + T(-1.0, -2.0), + T(0.0, 0.0) + ))) + } + + "Relu dnn should be same with bigdl relu" should "work correctly" in { + val input = Tensor(4, 96, 55, 55).rand(-1, 1) + val gradOutput = Tensor(4, 96, 55, 55).rand(-1, 1) + + val relu = nn.ReLU(ip = false) + val reludnn = ReLU() + val defaultFormat = HeapData(input.size(), Memory.Format.nchw) + reludnn.setRuntime(new MklDnnRuntime) + reludnn.initFwdPrimitives(Array(defaultFormat), TrainingPhase) + reludnn.initBwdPrimitives(Array(defaultFormat), TrainingPhase) + + val output = relu.forward(input) + val gradInput = relu.backward(input, gradOutput) + + val outputdnn = reludnn.forward(input) + val gradInputdnn = reludnn.backward(input, gradOutput) + + Equivalent.nearequals(output, Tools.dense(outputdnn).toTensor) should be(true) + Equivalent.nearequals(gradInput, Tools.dense(gradInputdnn).toTensor) should be(true) + } + +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/ReorderMemorySpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/ReorderMemorySpec.scala new file mode 100644 index 00000000000..d7b6f5ff14a --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/ReorderMemorySpec.scala @@ -0,0 +1,51 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
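Editor's aside on the "a simple relu" literals above: forward zeroes the negative entries, and backward passes gradients only where the input was positive, which is exactly what the two expected tensors encode. A standalone sketch (hypothetical object name, not part of the patch):

object ReluExpectation {
  def relu(x: Array[Float]): Array[Float] = x.map(v => math.max(v, 0f))
  def reluGrad(x: Array[Float], g: Array[Float]): Array[Float] =
    x.zip(g).map { case (v, d) => if (v > 0f) d else 0f }

  def main(args: Array[String]): Unit = {
    val input = Array(1f, 2f, -1f, -2f)
    val grad = Array(-1f, -2f, 1f, 2f)
    println(relu(input).mkString(" "))            // 1.0 2.0 0.0 0.0
    println(reluGrad(input, grad).mkString(" "))  // -1.0 -2.0 0.0 0.0
  }
}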
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.BigDLSpecHelper + +class ReorderMemorySpec extends BigDLSpecHelper { + "From heap to native" should "be correct" in { + val layer = ReorderMemory(new NativeData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc)) + layer.setRuntime(new MklDnnRuntime()) + layer.initFwdPrimitives(Array(HeapData(Array(3, 4), Memory.Format.nc)), Phase.TrainingPhase) + layer.initBwdPrimitives(Array(NativeData(Array(3, 4), Memory.Format.nc)), Phase.TrainingPhase) + val input = Tensor[Float](3, 4).rand() + val output = layer.forward(input) + val grad = layer.backward(input, output) + grad should be(input) + } + + "From heap to heap" should "be correct" in { + val layer = ReorderMemory( + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc) + ) + layer.setRuntime(new MklDnnRuntime()) + layer.initFwdPrimitives(Array(HeapData(Array(3, 4), Memory.Format.nc)), Phase.TrainingPhase) + layer.initBwdPrimitives(Array(NativeData(Array(3, 4), Memory.Format.nc)), Phase.TrainingPhase) + val input = Tensor[Float](3, 4).rand() + val output = layer.forward(input) + val grad = layer.backward(input, output) + grad should be(input) + } + +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SequentialSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SequentialSpec.scala new file mode 100644 index 00000000000..06fa24108c9 --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SequentialSpec.scala @@ -0,0 +1,102 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.{Memory, MklDnn} +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.BigDLSpecHelper + +class SequentialSpec extends BigDLSpecHelper { + "Sequential" should "not be called add after compilation" in { + val layer = ReorderMemory(NativeData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc)) + val layer2 = ReorderMemory(NativeData(Array(3, 4), Memory.Format.nc), + NativeData(Array(3, 4), Memory.Format.nc)) + val seq = new Sequential() + seq.add(layer) + seq.compile(Phase.TrainingPhase, Array(HeapData(Array(3, 4), Memory.Format.nc))) + intercept[IllegalArgumentException] { + seq.add(layer2) + } + } + + "Sequential" should "be correct when no memory reorder happened" in { + val layer1 = ReorderMemory(NativeData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc)) + val layer2 = ReorderMemory(NativeData(Array(3, 4), Memory.Format.io), + NativeData(Array(3, 4), Memory.Format.nc)) + val layer3 = ReorderMemory(HeapData(Array(3, 4), Memory.Format.nc), + NativeData(Array(3, 4), Memory.Format.io)) + val seq = new Sequential() + seq.add(layer1) + seq.add(layer2) + seq.add(layer3) + seq.compile(Phase.TrainingPhase, Array(HeapData(Array(3, 4), Memory.Format.nc))) + val input1 = Tensor[Float](3, 4).rand() + val input2 = Tensor[Float](3, 4).rand() + val output1 = seq.forward(input1) + output1 should be(input1) + val output2 = seq.forward(input2) + output2 should be(input2) + + val gradOutput1 = Tensor[Float](3, 4).rand() + val gradInput1 = seq.backward(input1, gradOutput1) + gradInput1 should be(gradOutput1) + + val gradOutput2 = Tensor[Float](3, 4).rand() + val gradInput2 = seq.backward(input2, gradOutput2) + gradInput2 should be(gradOutput2) + } + + "Sequential" should "be correct when auto add memory reorder" in { + val layer1 = ReorderMemory( + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc)) + val layer2 = ReorderMemory( + NativeData(Array(3, 4), Memory.Format.nc), + NativeData(Array(3, 4), Memory.Format.io), + NativeData(Array(3, 4), Memory.Format.nc), + NativeData(Array(3, 4), Memory.Format.io)) + val layer3 = ReorderMemory( + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc), + HeapData(Array(3, 4), Memory.Format.nc)) + val seq = Sequential() + seq.add(layer1) + seq.add(layer2) + seq.add(layer3) + seq.compile(Phase.TrainingPhase, Array(HeapData(Array(3, 4), Memory.Format.nc))) + val input1 = Tensor[Float](3, 4).rand() + val input2 = Tensor[Float](3, 4).rand() + println(s"Input1 is $input1") + println(s"Input2 is $input2") + val output1 = seq.forward(input1) + output1 should be(input1) + val output2 = seq.forward(input2) + output2 should be(input2) + + val gradOutput1 = Tensor[Float](3, 4).rand() + val gradInput1 = seq.backward(input1, gradOutput1) + gradInput1 should be(gradOutput1) + + val gradOutput2 = Tensor[Float](3, 4).rand() + val gradInput2 = seq.backward(input2, gradOutput2) + gradInput2 should be(gradOutput2) + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SingleLayerSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SingleLayerSpec.scala new file mode 100644 index 00000000000..8aa6bcc9e25 --- /dev/null +++ 
b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SingleLayerSpec.scala @@ -0,0 +1,329 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.nn.mkldnn.Phase.TrainingPhase +import com.intel.analytics.bigdl.tensor.{MklDnnType, Tensor} +import org.scalatest.{BeforeAndAfter, FlatSpec, Ignore, Matchers} + +@Ignore +class SingleLayerSpec extends FlatSpec with Matchers with BeforeAndAfter { + "convolution" should "work correctly" in { + val inputShape = Array(4, 3, 5, 5) + val outputShape = Array(4, 2, 3, 3) + val name = "conv" + val nOutput = 2 + val kernel = 3 + val pad = 1 + val stride = 2 + + val prototxt = + s""" + |name: "conv-simple" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { ${shape2Dim(inputShape)} } + | } + |} + | + |layer { + | bottom: "data" + | top: "conv" + | name: "$name" + | type: "Convolution" + | convolution_param { + | num_output: $nOutput + | kernel_size: $kernel + | pad: $pad + | stride: $stride + | weight_filler { + | type: "msra" + | variance_norm: FAN_OUT + | } + | bias_filler { + | type: "gaussian" + | } + | } + |} + """.stripMargin + + val conv = SpatialConvolution(3, nOutput, kernel, kernel, stride, stride, pad, pad, 1) + .setName(name) + conv.setRuntime(new MklDnnRuntime) + conv.initFwdPrimitives(Array(HeapData(inputShape, Memory.Format.nchw)), TrainingPhase) + conv.initBwdPrimitives(Array(HeapData(outputShape, Memory.Format.nchw)), TrainingPhase) + conv.initGradWPrimitives(Array(HeapData(outputShape, Memory.Format.nchw)), TrainingPhase) + Tools.compare(prototxt, conv, inputShape, outputShape) + } + + "convolution2" should "work correctly" in { + val inputShape = Array(4, 3, 224, 224) + val outputShape = Array(4, 64, 112, 112) + val name = "conv" + val nOutput = 64 + val kernel = 7 + val pad = 3 + val stride = 2 + + val prototxt = + s""" + |name: "conv-simple" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { ${shape2Dim(inputShape)} } + | } + |} + | + |layer { + | bottom: "data" + | top: "conv" + | name: "$name" + | type: "Convolution" + | convolution_param { + | num_output: $nOutput + | kernel_size: $kernel + | pad: $pad + | stride: $stride + | weight_filler { + | type: "msra" + | variance_norm: FAN_OUT + | } + | bias_filler { + | type: "gaussian" + | } + | } + |} + """.stripMargin + + val conv = SpatialConvolution(3, nOutput, kernel, kernel, stride, stride, pad, pad, 1) + .setName(name) + val seq = Sequential() + .add(conv) + .add(ReorderMemory(HeapData(outputShape, Memory.Format.nchw))) + + seq.compile(TrainingPhase, Array(HeapData(inputShape, 
Memory.Format.nchw))) + + Tools.compare(prototxt, seq, inputShape, outputShape) + } + + "max pooling" should "work correctly" in { + val inputShape = Array(4, 64, 112, 112) + val outputShape = Array(4, 64, 56, 56) + val name = "pool" + val prototxt = + s""" + |name: "maxpool-simple" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { ${shape2Dim(inputShape)} } + | } + |} + | + |layer { + | bottom: "data" + | top: "pool" + | name: "$name" + | type: "Pooling" + | pooling_param { + | kernel_size: 3 + | stride: 2 + | pool: MAX + | } + |} + """.stripMargin + + val maxPooling = MaxPooling(3, 3, 2, 2).setName(name) + maxPooling.setRuntime(new MklDnnRuntime) + maxPooling.initFwdPrimitives(Array(HeapData(inputShape, Memory.Format.nchw)), TrainingPhase) + maxPooling.initBwdPrimitives(Array(HeapData(outputShape, Memory.Format.nchw)), TrainingPhase) + + Tools.compare(prototxt, maxPooling, inputShape, outputShape) + } + + "avg pooling" should "work correctly" in { + val inputShape = Array(4, 3, 7, 7) + val outputShape = Array(4, 3, 3, 3) + val name = "pool" + val prototxt = + s""" + |name: "maxpool-simple" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { ${shape2Dim(inputShape)} } + | } + |} + | + |layer { + | bottom: "data" + | top: "pool" + | name: "$name" + | type: "Pooling" + | pooling_param { + | kernel_size: 3 + | stride: 2 + | pool: AVE + | } + |} + """.stripMargin + + val avgPooling = AvgPooling(3, 3, 2, 2).setName(name) + avgPooling.setRuntime(new MklDnnRuntime) + avgPooling.initFwdPrimitives(Array(HeapData(inputShape, Memory.Format.nchw)), TrainingPhase) + avgPooling.initBwdPrimitives(Array(HeapData(outputShape, Memory.Format.nchw)), TrainingPhase) + Tools.compare(prototxt, avgPooling, inputShape, outputShape) + } + + "linear " should "work correctly" in { + val (batchSize, nInput) = (4, 64) + val inputShape = Array(batchSize, nInput) + val nOutput = 1000 + val outputShape = Array(batchSize, nOutput) + val name = "fc" + + val prototxt = + s""" + |name: "relu-simple" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { ${shape2Dim(inputShape)} } + | } + |} + | + |layer { + | bottom: "data" + | top: "$name" + | name: "$name" + | type: "InnerProduct" + | inner_product_param { + | num_output: $nOutput + | weight_filler { + | type: "gaussian" + | std: 0.01 + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + """.stripMargin + val linear = Linear(nInput, nOutput).setName(name) + linear.setRuntime(new MklDnnRuntime) + linear.initFwdPrimitives(Array(HeapData(inputShape, Memory.Format.nc)), TrainingPhase) + linear.initBwdPrimitives(Array(HeapData(outputShape, Memory.Format.nc)), TrainingPhase) + linear.initGradWPrimitives(Array(HeapData(outputShape, Memory.Format.nc)), TrainingPhase) + + Tools.compare(prototxt, linear, inputShape, outputShape) + } + + "relu" should "work correctly" in { + val (batchSize, channel, height, width) = (4, 64, 112, 112) + val inputShape = Array(batchSize, channel, height, width) + val outputShape = inputShape + val name = "relu" + val prototxt = + s""" + |name: "relu-simple" + |force_backward: 
true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { dim: $batchSize dim: $channel dim: $height dim: $width } + | } + |} + | + |layer { + | bottom: "data" + | top: "relu" + | name: "$name" + | type: "ReLU" + | relu_param { + | } + |} + """.stripMargin + + val relu = ReLU().setName(name) + relu.setRuntime(new MklDnnRuntime) + relu.initFwdPrimitives(Array(HeapData(inputShape, Memory.Format.nchw)), TrainingPhase) + relu.initBwdPrimitives(Array(HeapData(outputShape, Memory.Format.nchw)), TrainingPhase) + Tools.compare(prototxt, relu, inputShape, outputShape) + } + + private def shape2Dim(shape: Array[Int]): String = { + shape.map(x => "dim: " + x).mkString(" ") + } +} + diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SoftMaxSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SoftMaxSpec.scala new file mode 100644 index 00000000000..e6a2790fd82 --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SoftMaxSpec.scala @@ -0,0 +1,142 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.nn +import com.intel.analytics.bigdl.nn.mkldnn.Phase.InferencePhase +import com.intel.analytics.bigdl.numeric.NumericFloat +import com.intel.analytics.bigdl.tensor.Tensor +import org.scalatest.{FlatSpec, Matchers} + +class SoftMaxSpec extends FlatSpec with Matchers { + "SoftMax forward 1-D" should "work correctly" in { + // we should test the cases which contain 1 + val tests = List(2, 1) + + for (x <- tests) { + val sm = SoftMax() + sm.evaluate() + sm.setRuntime(new MklDnnRuntime) + sm.initFwdPrimitives(Array(HeapData(Array(x), Memory.Format.x)), InferencePhase) + + val input = Tensor(x).rand() + + val output = sm.forward(input) + + val nnSm = nn.SoftMax() + val nnOutput = nnSm.forward(input) + + Tools.dense(output) should be (nnOutput) + } + } + + "SoftMax forward 2-D" should "work correctly" in { + val tests = List( + (2, 3), + (1, 3), + (1, 1), + (2, 1)) + + for ((batchSize, channel) <- tests) { + val sm = SoftMax() + sm.setRuntime(new MklDnnRuntime) + sm.initFwdPrimitives(Array(HeapData(Array(batchSize, channel), Memory.Format.nc)), + InferencePhase) + sm.evaluate() + + val input = Tensor(batchSize, channel).rand() + + val output = sm.forward(input) + + val nnSm = nn.SoftMax() + val nnOutput = nnSm.forward(input) + + Tools.dense(output) shouldEqual nnOutput + } + } + + "SoftMax forward 4-D" should "work correctly" in { + // we should test the cases which contain 1 + val tests = List( + (2, 3, 4, 4), + (1, 3, 4, 4), + (1, 3, 1, 1), + (1, 1, 1, 1), + (1, 1, 3, 3), + (2, 1, 3, 3), + (2, 2, 1, 1)) + + for ((batchSize, channel, height, width) <- tests) { + val sm = SoftMax() + sm.setRuntime(new MklDnnRuntime) + 
sm.initFwdPrimitives(Array(HeapData(Array(batchSize, channel, height, width), + Memory.Format.nchw)), InferencePhase) + sm.evaluate() + + val input = Tensor(batchSize, channel, height, width).rand() + + val output = sm.forward(input) + + val nnSm = nn.SoftMax() + val nnOutput = nnSm.forward(input) + + Tools.dense(output) should be (nnOutput) + } + } + + "SoftMax backward" should "work correctly" in { + val (batchSize, channel, height, width) = (2, 3, 4, 4) + val sm = SoftMax() + sm.setRuntime(new MklDnnRuntime) + sm.initFwdPrimitives(Array(HeapData(Array(batchSize, channel, height, width), + Memory.Format.nchw)), InferencePhase) + + val nnSm = nn.SoftMax() + + val input = Tensor(batchSize, channel, height, width).rand() + val gradOutput = Tensor().resizeAs(input).rand(-10, 10) + + sm.forward(input) + nnSm.forward(input) + + sm.backward(input, gradOutput) + nnSm.backward(input, gradOutput) + + sm.output should be (nnSm.output) + sm.gradInput should be (nnSm.gradInput) + } + + "SoftMax multi times forward" should "work correctly" in { + val (batchSize, channel, height, width) = (2, 3, 4, 4) + val sm = SoftMax() + sm.setRuntime(new MklDnnRuntime) + sm.initFwdPrimitives(Array(HeapData(Array(batchSize, channel, height, width), + Memory.Format.nchw)), InferencePhase) + sm.evaluate() + + val nnSm = nn.SoftMax() + + (0 until 5).foreach { _ => + val input = Tensor(batchSize, channel, height, width).rand(-1, 1) + sm.forward(input) + nnSm.forward(input) + + Tools.dense(sm.output) should be (nnSm.output) + } + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SpatialBatchNormalizationSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SpatialBatchNormalizationSpec.scala new file mode 100644 index 00000000000..d49a27aa5d9 --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SpatialBatchNormalizationSpec.scala @@ -0,0 +1,535 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.nn +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.nn.mkldnn.Phase.TrainingPhase +import com.intel.analytics.bigdl.numeric.NumericFloat +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.RandomGenerator._ +import org.scalatest.{FlatSpec, Ignore, Matchers} + +class SpatialBatchNormalizationSpec extends FlatSpec with Matchers { + "a simple bn with random input" should "work correctly" in { + val batchSize = 2 + RNG.setSeed(100) + val input = Tensor(100, 1, 10, 10).rand(-1, 1) + val (channel, height, width) = (1, 10, 10) + + val initWeight = Tensor(channel).rand(-1, 1) + val initBias = Tensor(channel).fill(0) + + val bn = SpatialBatchNormalization(1, 0.0, initWeight = initWeight, initBias = initBias) + val nnBn = nn.SpatialBatchNormalization(1, 0.0, initWeight = initWeight, initBias = initBias) + + val inputShape = Array(100, 1, 10, 10) + bn.setRuntime(new MklDnnRuntime) + bn.initFwdPrimitives(Array(HeapData(inputShape, Memory.Format.nchw)), TrainingPhase) + bn.initBwdPrimitives(Array(HeapData(inputShape, Memory.Format.nchw)), TrainingPhase) + bn.initGradWPrimitives(Array(HeapData(inputShape, Memory.Format.nchw)), TrainingPhase) + + val out1 = bn.forward(input) + val out2 = nnBn.forward(input) + + Equivalent.nearequals(Tools.dense(out1).toTensor, out2, 1e-4) should be(true) + + val gradOutput = Tensor[Float]().resizeAs(input).rand() + + bn.backward(input, gradOutput) + nnBn.backward(input, gradOutput) + + val gradWeight1 = Tools.dense(bn.gradWeightAndBias).toTensor + val gradWeight2 = nnBn.getParameters()._2 + + val weight1 = Tools.dense(bn.weightAndBias).toTensor + val weight2 = nnBn.getParameters()._1 + + Equivalent.nearequals(weight1, weight2) should be (true) + Equivalent.nearequals(gradWeight1, gradWeight2) should be (true) + + Equivalent.nearequals(Tools.dense(bn.gradInput).toTensor, nnBn.gradInput) should be (true) + } + + "bn updateOutput" should "work correctly" in { + val (batchSize, channel, height, width) = (4, 64, 112, 112) + val inputShape = Array(batchSize, channel, height, width) + val defaultFormat = HeapData(inputShape, Memory.Format.nchw) + val epsilon = 1e-5 + + val input = Tensor(batchSize, channel, height, width).rand(-1, 1) + val initWeight = Tensor(channel).rand(-1, 1) + val initBias = Tensor(channel).fill(0) + + val bn = SpatialBatchNormalization(channel, epsilon, initWeight = initWeight, + initBias = initBias) + bn.setRuntime(new MklDnnRuntime) + bn.initFwdPrimitives(Array(defaultFormat), TrainingPhase) + bn.initBwdPrimitives(Array(defaultFormat), TrainingPhase) + bn.initGradWPrimitives(Array(defaultFormat), TrainingPhase) + + val output = Tools.toNCHW(bn.forward(input).toTensor, bn.outputFormats()(0)) + + val nnBn = nn.SpatialBatchNormalization(channel, epsilon, + initWeight = initWeight, initBias = initBias) + val nnOutput = nnBn.forward(input) + + Equivalent.nearequals(output, nnOutput) should be (true) + } + + "bn updateOutput multi times" should "work correctly" in { + val (batchSize, channel, height, width) = (2, 3, 4, 4) + val inputShape = Array(batchSize, channel, height, width) + val defaultFormat = HeapData(inputShape, Memory.Format.nchw) + val epsilon = 1e-5 + + val input = Tensor(batchSize, channel, height, width).rand(-1, 1) + val initWeight = Tensor(channel).rand(-1, 1) + val initBias = Tensor(channel).rand(-1, 1) + + val bn = 
SpatialBatchNormalization(channel, epsilon, initWeight = initWeight, + initBias = initBias) + bn.setRuntime(new MklDnnRuntime) + bn.initFwdPrimitives(Array(defaultFormat), TrainingPhase) + bn.initBwdPrimitives(Array(defaultFormat), TrainingPhase) + bn.initGradWPrimitives(Array(defaultFormat), TrainingPhase) + + Utils.manyTimes(bn.forward(input))(10) + + val nnBn = nn.SpatialBatchNormalization(channel, epsilon, + initWeight = initWeight, initBias = initBias) + + Utils.manyTimes(nnBn.forward(input))(10) + + val output = Tools.toNCHW(bn.output.toTensor, bn.outputFormats()(0)) + + Equivalent.nearequals(output, nnBn.output.toTensor) should be (true) + } + + "bn backward" should "work correctly" in { + val (batchSize, channel, height, width) = (2, 3, 4, 4) + val inputShape = Array(batchSize, channel, height, width) + val defaultFormat = HeapData(inputShape, Memory.Format.nchw) + val epsilon = 0.0f + + val input = Tensor(batchSize, channel, height, width).rand(-1, 1) + val gradOutput = Tensor().resize(inputShape).rand(-1, 1) + val initWeight = Tensor(channel).rand(-1, 1) + val initBias = Tensor(channel).rand(-1, 1) + + val bn = SpatialBatchNormalization(channel, epsilon, initWeight = initWeight, + initBias = initBias) + bn.setRuntime(new MklDnnRuntime) + bn.initFwdPrimitives(Array(defaultFormat), TrainingPhase) + bn.initBwdPrimitives(Array(defaultFormat), TrainingPhase) + bn.initGradWPrimitives(Array(defaultFormat), TrainingPhase) + + val nnBn = nn.SpatialBatchNormalization(channel, epsilon, + initWeight = initWeight, initBias = initBias) + + bn.forward(input) + nnBn.forward(input) + + val output = Tools.toNCHW(bn.output.toTensor, bn.outputFormats()(0)) + + Equivalent.nearequals(output, nnBn.output) should be (true) + + bn.backward(input, gradOutput) + val nnGradInput = nnBn.backward(input, gradOutput) + + val gradInput = Tools.toNCHW(bn.gradInput.toTensor, bn.gradInputFormats()(0)) + val weightAndBias = Tools.dense(bn.parameters()._2(0)).toTensor + + Equivalent.nearequals(gradInput, nnGradInput.toTensor) should be (true) + Equivalent.nearequals(weightAndBias, nnBn.getParameters()._2) should be (true) + } + +// "bn perf" should "work correctly" in { +// // For PERF test. It seems sometimes batch norm maybe slower than java version. 
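+// // Sketch of how this disabled benchmark works: both implementations are warmed up for
+// // 10 iterations, then 50 iterations of forward + backward are timed with Utils.manyTimes,
+// // e.g. val (dnnCost, _) = Utils.manyTimes { bn.forward(input); bn.backward(input, gradOutput) }(50),
+// // and the MKL-DNN cost is expected to stay below the BLAS cost. It is kept disabled
+// // because that speedup is not reproducible on every machine.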
+// val (batchSize, channel, height, width) = (4, 64, 112, 112) +// val inputShape = Array(batchSize, channel, height, width) +// val defaultFormat = HeapData(inputShape, Memory.Format.nChw8c) +// val epsilon = 0.0f +// +// val input = Tensor(batchSize, channel, height, width).rand(-1, 1) +// val gradOutput = Tensor().resizeAs(input).rand(-1, 1) +// +// val initWeight = Tensor(channel).rand(-1, 1) +// val initBias = Tensor(channel).rand(-1, 1) +// +// val bn = SpatialBatchNormalization(channel, epsilon, initWeight = initWeight, +// initBias = initBias) +// bn.setRuntime(new MklDnnRuntime) +// bn.initFwdPrimitives(Array(defaultFormat), TrainingPhase) +// bn.initBwdPrimitives(Array(defaultFormat), TrainingPhase) +// bn.initGradWPrimitives(Array(defaultFormat), TrainingPhase) +// +// val nnBn = nn.SpatialBatchNormalization(channel, epsilon, +// initWeight = initWeight, initBias = initBias) +// +// val times = Utils.manyTimes { +// bn.forward(input) +// bn.backward(input, gradOutput) +// } _ +// +// val nnTimes = Utils.manyTimes { +// nnBn.forward(input) +// nnBn.backward(input, gradOutput) +// } _ +// +// times(10) +// nnTimes(10) +// +// val costs = times(50)._1 +// val nnCosts = nnTimes(50)._1 +// +// costs should be < (nnCosts) +// } + + "a complicated batch norm" should "work correctly" in { + val (channel, height, width) = (64, 112, 112) + val epsilon = 1e-3 + val batchSize = 2 + + RNG.setSeed(100) + val input = Tensor[Float](Array(batchSize, 64, 112, 112)).rand(-1, 1) + val gradOutput = Tensor().resizeAs(input).copy(input) + + RNG.setSeed(100) + val initWeight = Tensor(channel).rand(-1, 1) + val initBias = Tensor(channel).fill(0f) + + val inputShape = input.size() + val outputShape = input.size() + val defaultFormat = HeapData(inputShape, Memory.Format.nchw) + + val bn = SpatialBatchNormalization(channel, epsilon, initWeight = initWeight, + initBias = initBias) + bn.setRuntime(new MklDnnRuntime) + bn.initFwdPrimitives(Array(defaultFormat), TrainingPhase) + bn.initBwdPrimitives(Array(defaultFormat), TrainingPhase) + bn.initGradWPrimitives(Array(defaultFormat), TrainingPhase) + + val nnBn = nn.SpatialBatchNormalization(channel, epsilon, initWeight = initWeight, + initBias = initBias) + + bn.zeroGradParameters() + nnBn.zeroGradParameters() + + val (weight, gradWeight) = bn.parameters() + val (nnWeight, nnGradWeight) = nnBn.getParameters() + Equivalent.nearequals(Tools.dense(weight(0)).toTensor, nnWeight) should be(true) + Equivalent.nearequals(Tools.dense(gradWeight(0)).toTensor, nnGradWeight) should be(true) + + val out1 = bn.forward(input) + val out2 = nnBn.forward(input) + + Equivalent.nearequals(Tools.dense(bn.output).toTensor, nnBn.output) should be (true) + + val gradInput = bn.backward(input, gradOutput) + val nnGradInput = nnBn.backward(input, gradOutput) + + Equivalent.nearequals(Tools.dense(gradInput).toTensor, nnGradInput.toTensor) should be (true) + Equivalent.nearequals(Tools.dense(gradWeight(0)).toTensor, nnGradWeight, 1e-3) should be (true) + } + + "A nChw8c input" should "work correctly" in { + val (batchSize, channel, height, width) = (2, 256, 56, 56) + val input = Tensor(batchSize, channel, height, width).rand(-1, 1) + val gradOutput = Tensor(batchSize, channel, height, width).rand(-1, 1) + + val inputShape = input.size() + val reorder1 = ReorderMemory(HeapData(inputShape, Memory.Format.nChw8c)) + val reorder2 = ReorderMemory(HeapData(inputShape, Memory.Format.nchw)) + + val initWeight = Tensor(channel).rand(-1, 1) + val initBias = Tensor(channel).rand(-1, 1) + + val dnn 
= Sequential() + .add(reorder1) + .add(SpatialBatchNormalization(channel, 1e-3, initWeight = initWeight, initBias = initBias)) + .add(reorder2) + + dnn.compile(TrainingPhase, Array(HeapData(inputShape, Memory.Format.nchw))) + + val blas = nn.Sequential().add( + nn.SpatialBatchNormalization(channel, 1e-3, initWeight = initWeight, initBias = initBias)) + + dnn.forward(input) + blas.forward(input) + + dnn.backward(input, gradOutput) + blas.backward(input, gradOutput) + + val gradWeight = Tools.dense(dnn.parameters()._2(0)).toTensor + + Equivalent.nearequals(dnn.output.toTensor, blas.output.toTensor, 1e-4) should be (true) + Equivalent.nearequals(dnn.gradInput.toTensor, blas.gradInput.toTensor, 1e-4) should be (true) + Equivalent.nearequals(gradWeight, blas.getParameters()._2, 1e-3) should be (true) + } + +// "A nChw16c input" should "work correctly" in { +// // only works on avx512 (SKX->) +// val (batchSize, channel, height, width) = (2, 256, 56, 56) +// val input = Tensor(batchSize, channel, height, width).rand(-1, 1) +// val gradOutput = Tensor(batchSize, channel, height, width).rand(-1, 1) +// +// val inputShape = input.size() +// val reorder1 = ReorderMemory(HeapData(inputShape, Memory.Format.nChw16c)) +// val reorder2 = ReorderMemory(HeapData(inputShape, Memory.Format.nchw)) +// +// val initWeight = Tensor(channel).rand(-1, 1) +// val initBias = Tensor(channel).rand(-1, 1) +// +// val dnn = Sequential() +// .add(reorder1) +// .add(SpatialBatchNormalization(channel, 1e-3, initWeight = initWeight, initBias = initBias)) +// .add(reorder2) +// +// dnn.compile(TrainingPhase, Array(HeapData(inputShape, Memory.Format.nchw))) +// +// val blas = nn.Sequential().add( +// nn.SpatialBatchNormalization(channel, 1e-3, initWeight = initWeight, initBias = initBias)) +// +// dnn.forward(input) +// blas.forward(input) +// +// dnn.backward(input, gradOutput) +// blas.backward(input, gradOutput) +// +// val gradWeight = Tools.dense(dnn.parameters()._2(0)).toTensor +// +// DnnUtils.nearequals(dnn.output.toTensor, blas.output.toTensor, 1e-4) should be (true) +// DnnUtils.nearequals(dnn.gradInput.toTensor, blas.gradInput.toTensor, 1e-4) should be (true) +// DnnUtils.nearequals(gradWeight, blas.getParameters()._2, 1e-3) should be (true) +// } + + "Sbn with relu fusion" should "work correctly" in { + val (batchSize, channel, height, width) = (4, 64, 112, 112) + val shape = Array(batchSize, channel, height, width) + val epsilon = 1e-5 + + val initWeight = Tensor(channel).rand(-1, 1) + val initBias = Tensor(channel).fill(0) + + val bn1 = SpatialBatchNormalization(channel, epsilon, initWeight = initWeight, + initBias = initBias) + val reorder1 = ReorderMemory(HeapData(shape, Memory.Format.nchw)) + val bn2 = SpatialBatchNormalization(channel, epsilon, initWeight = initWeight, + initBias = initBias) + val reorder2 = ReorderMemory(HeapData(shape, Memory.Format.nchw)) + + val model1 = Sequential().add(bn1).add(ReLU()).add(ReLU()).add(reorder1) + model1.compile(TrainingPhase, Array(HeapData(shape, Memory.Format.nchw))) + + System.setProperty("bigdl.mkldnn.fusion.bnrelu", "true") + val model2 = Sequential().add(bn2).add(ReLU()).add(ReLU()).add(reorder2) + model2.compile(TrainingPhase, Array(HeapData(shape, Memory.Format.nchw))) + System.setProperty("bigdl.mkldnn.fusion.bnrelu", "false") + + val input = Tensor(batchSize, channel, height, width).rand(-1, 1) + + model1.forward(input) + model2.forward(input) + + model1.output should be (model2.output) + } + + "a simple bach norm" should "work correctly" ignore { + val 
(batchSize, channel, height, width) = (4, 64, 2, 2) + val shape = Array(batchSize, channel, height, width) + val prototxt = s""" + |name: "relu-simple" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { dim: $batchSize dim: $channel dim: $height dim: $width } + | } + |} + | + |layer { + | bottom: "data" + | top: "bn" + | name: "bn" + | type: "BatchNorm" + | + | batch_norm_param { + | moving_average_fraction: 1.0 + | filler { value: 1 } + | bias_filler { value: 1 } + | relu: false + | eps: 0.0 + | } + |} + """.stripMargin + + val identity = Collect.run(prototxt) + + val input = Tools.getTensor("Fwrd_data", shape, identity) + val output = Tools.getTensor("Fwrd_bn", shape, identity) + val weight = Tools.getTensor("Fwrd_bn.Wght.3", Array(channel), identity) + val bias = Tools.getTensor("Fwrd_bn.Wght.4", Array(channel), identity) + val scale = Tools.getTensor("Fwrd_bn.Wght.2", Array(1), identity) + val runningMean = Tools.getTensor("Fwrd_bn.Wght.0", Array(channel), identity) + val runningVariance = Tools.getTensor("Fwrd_bn.Wght.1", Array(channel), identity) + val gradOutput = Tools.getTensor("Bwrd_bn.loss", shape, identity) + val gradInput = Tools.getTensor("Bwrd_bn", shape, identity) + val gradWeight = Tools.getTensor("Bwrd_bn.Grad.3", Array(channel), identity) + val gradBias = Tools.getTensor("Bwrd_bn.Grad.4", Array(channel), identity) + + val bn = new SpatialBatchNormalization(channel, eps = 0.0, momentum = 1.0, + affine = true, initWeight = weight, initBias = bias) + + val reorder1 = ReorderMemory(HeapData(shape, Memory.Format.nchw)).setName("reorder1") + val reorder2 = ReorderMemory(HeapData(shape, Memory.Format.nchw)).setName("reorder2") + val reorder3 = ReorderMemory(HeapData(shape, Memory.Format.nChw8c)).setName("reorder3") + val reorder4 = ReorderMemory(HeapData(shape, Memory.Format.nchw)).setName("reorder4") + + val seq = Sequential() + seq.add(reorder1) + seq.add(reorder3) + seq.add(bn) + seq.add(reorder2) + seq.compile(Phase.TrainingPhase, Array(HeapData(shape, Memory.Format.nchw))) + seq.reset() + + bn.zeroGradParameters() + + seq.forward(input) + seq.backward(input, gradOutput) + + val weightAndBias = Tensor[Float](Array(2, channel)) + weightAndBias.select(1, 1).copy(weight) + weightAndBias.select(1, 2).copy(bias) + + val gradWeightAndBias = Tensor[Float](Array(2, channel)) + gradWeightAndBias.select(1, 1).copy(gradWeight) + gradWeightAndBias.select(1, 2).copy(gradBias) + + compare(weightAndBias.view(Array(2 * channel)), bn.weightAndBias) + compare(output, seq.output) + compare(runningMean, bn.runningMean) + compare(runningVariance, bn.runningVariance) + compare(gradWeightAndBias.view(Array(2 * channel)), bn.gradWeightAndBias) + compare(gradInput, seq.gradInput) + } + + "a simple bach norm inference" should "work correctly" ignore { + val (batchSize, channel, height, width) = (4, 64, 112, 112) + val shape = Array(batchSize, channel, height, width) + val prototxt = s""" + |name: "relu-simple" + |force_backward: true + |state { + | phase: TEST + |} + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { dim: $batchSize dim: $channel dim: $height dim: $width } + | } + |} + | + |layer { + | bottom: "data" + | top: "bn" + | name: "bn" + | type: "BatchNorm" + | + | batch_norm_param { + | 
moving_average_fraction: 1.0 + | filler { value: 1 } + | bias_filler { value: 0 } + | relu: false + | eps: 0.0 + | } + | + | phase: TEST + |} + """.stripMargin + + val identity = Collect.run(prototxt) + + val input = Tools.getTensor("Fwrd_data", shape, identity) + val output = Tools.getTensor("Fwrd_bn", shape, identity) + val weight = Tools.getTensor("Fwrd_bn.Wght.3", Array(channel), identity) + val bias = Tools.getTensor("Fwrd_bn.Wght.4", Array(channel), identity) + val scale = Tools.getTensor("Fwrd_bn.Wght.2", Array(1), identity) + val runningMean = Tools.getTensor("Fwrd_bn.Wght.0", Array(channel), identity) + val runningVariance = Tools.getTensor("Fwrd_bn.Wght.1", Array(channel), identity) + + val bn = new SpatialBatchNormalization(channel, eps = 0.0, momentum = 1.0, + affine = true, initWeight = weight, initBias = bias) + bn.runningMean.copy(runningMean) + bn.runningVariance.copy(runningVariance) + + val reorder1 = ReorderMemory(HeapData(shape, Memory.Format.nchw)).setName("reorder1") + val reorder2 = ReorderMemory(HeapData(shape, Memory.Format.nchw)).setName("reorder2") + + val seq = Sequential() + seq.add(reorder1) + seq.add(bn) + seq.add(reorder2) + seq.compile(Phase.InferencePhase, Array(HeapData(shape, Memory.Format.nchw))) + seq.reset() + seq.evaluate() + + seq.forward(input) + + val weightAndBias = Tensor[Float](Array(2, channel)) + weightAndBias.select(1, 1).copy(weight) + weightAndBias.select(1, 2).copy(bias) + + compare(weightAndBias.view(Array(2 * channel)), bn.weightAndBias) + compare(runningMean, bn.runningMean) + compare(runningVariance, bn.runningVariance) + + val denseOutput = Tools.dense(bn.output).toTensor + + denseOutput.storage().array().zip(output.storage().array()).foreach { x => + if (x._2.isInfinity) x._1.isNaN should be (true) + } + } + + private def compare(src: Activity, dst: Activity): Unit = { + if (src.isTensor) { + Equivalent.nearequals(Tools.dense(src).toTensor, Tools.dense(dst).toTensor) should be (true) + } + } + + private def shape2Dim(shape: Array[Int]): String = { + shape.map(x => "dim: " + x).mkString(" ") + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SpatialConvolutionSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SpatialConvolutionSpec.scala new file mode 100644 index 00000000000..b27c07d045f --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/SpatialConvolutionSpec.scala @@ -0,0 +1,514 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl._ +import com.intel.analytics.bigdl.mkl._ +import com.intel.analytics.bigdl.nn.mkldnn.Phase.TrainingPhase +import com.intel.analytics.bigdl.nn.{Xavier, Zeros} +import com.intel.analytics.bigdl.numeric.NumericFloat +import com.intel.analytics.bigdl.tensor.Tensor +import com.intel.analytics.bigdl.utils.RandomGenerator._ +import org.scalatest.{FlatSpec, Matchers} + +import scala.util.Random + +class SpatialConvolutionSpec extends FlatSpec with Matchers { + "ConvolutionDnn with format=nchw and ngroup=1" should "work correctly" in { + val nInputPlane = 2 + val nOutputPlane = 4 + val kW = 3 + val kH = 3 + val dW = 4 + val dH = 4 + val padW = 0 + val padH = 0 + + val input = Tensor[Float](2, 2, 23, 23).apply1(e => Random.nextFloat()) + val gradOutput = Tensor[Float](2, 4, 6, 6).apply1(e => Random.nextFloat()) + RNG.setSeed(100) + val conv = SpatialConvolution(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH) + RNG.setSeed(100) + val layer = nn.SpatialConvolution[Float](nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH) + + conv.setRuntime(new MklDnnRuntime) + conv.initFwdPrimitives(Array(HeapData(Array(2, 2, 23, 23), Memory.Format.nchw)), TrainingPhase) + conv.initBwdPrimitives(Array(HeapData(Array(2, 4, 6, 6), Memory.Format.nchw)), TrainingPhase) + conv.initGradWPrimitives(Array(HeapData(Array(2, 4, 6, 6), Memory.Format.nchw)), TrainingPhase) + + val output = Tools.toNCHW(conv.forward(input).toTensor, conv.outputFormats()(0)) + val grad1 = Tools.toNCHW(conv.updateGradInput(input, gradOutput).toTensor, + conv.gradInputFormats()(0)) + conv.accGradParameters(input, gradOutput) + + val weight1 = Tools.toOIHW(conv.weight, conv.ParamsShape.weight) + val gradweight1 = Tools.toOIHW(conv.gradWeight, conv.ParamsShape.gradWeight) + val bias1 = Tools.dense(conv.bias).toTensor[Float] + val gradbias1 = Tools.dense(conv.gradBias).toTensor + + val output2 = layer.forward(input) + val grad2 = layer.updateGradInput(input, gradOutput) + layer.accGradParameters(input, gradOutput) + + val weight2 = layer.weight + val gradweight2 = layer.gradWeight + val bias2 = layer.bias + val gradbias2 = layer.gradBias + + Equivalent.nearequals(weight1, weight2.resizeAs(weight1)) should be(true) + Equivalent.nearequals(gradweight1, gradweight2.resizeAs(gradweight1)) should be(true) + Equivalent.nearequals(bias1, bias2) should be(true) + Equivalent.nearequals(gradbias1, gradbias2) should be(true) + Equivalent.nearequals(output.toTensor, output2) should be(true) + Equivalent.nearequals(grad1.toTensor, grad2) should be(true) + } + + "ConvolutionDnn with format=nchw and ngroup=2" should "work correctly" in { + val nInputPlane = 2 + val nOutputPlane = 4 + val kW = 3 + val kH = 3 + val dW = 4 + val dH = 4 + val padW = 0 + val padH = 0 + val ngroup = 2 + + val input = Tensor[Float](2, 2, 23, 23).apply1(e => Random.nextFloat()) + val gradOutput = Tensor[Float](2, 4, 6, 6).apply1(e => Random.nextFloat()) + RNG.setSeed(100) + val conv = SpatialConvolution(nInputPlane, nOutputPlane, kW, kH, dW, dH, + padW, padH, ngroup) + RNG.setSeed(100) + val layer = nn.SpatialConvolution[Float](nInputPlane, nOutputPlane, kW, kH, + dW, dH, padW, padH, ngroup) + + conv.setRuntime(new MklDnnRuntime) + conv.initFwdPrimitives(Array(HeapData(Array(2, 2, 23, 23), Memory.Format.nchw)), TrainingPhase) + conv.initBwdPrimitives(Array(HeapData(Array(2, 4, 6, 6), Memory.Format.nchw)), TrainingPhase) + conv.initGradWPrimitives(Array(HeapData(Array(2, 4, 6, 6), 
Memory.Format.nchw)), TrainingPhase) + + val output2 = layer.forward(input) + val grad2 = layer.updateGradInput(input, gradOutput) + layer.accGradParameters(input, gradOutput) + val weight2 = layer.weight + val gradweight2 = layer.gradWeight + val bias2 = layer.bias + val gradbias2 = layer.gradBias + + val output = Tools.toNCHW(conv.forward(input).toTensor, conv.outputFormats()(0)) + val grad1 = Tools.toNCHW(conv.updateGradInput(input, gradOutput).toTensor, + conv.gradInputFormats()(0)) + conv.accGradParameters(input, gradOutput) + val weight1 = Tools.toOIHW(conv.weight, conv.ParamsShape.weight) + val gradweight1 = Tools.toOIHW(conv.gradWeight, conv.ParamsShape.gradWeight) + val bias1 = Tools.dense(conv.bias).toTensor[Float] + val gradbias1 = Tools.dense(conv.gradBias).toTensor[Float] + + Equivalent.nearequals(weight1, weight2) should be(true) + Equivalent.nearequals(gradweight1, gradweight2) should be(true) + Equivalent.nearequals(bias1, bias2) should be(true) + Equivalent.nearequals(gradbias1, gradbias2) should be(true) + Equivalent.nearequals(output, output2) should be(true) + Equivalent.nearequals(grad1, grad2) should be(true) + } + + "ConvolutionDnn with relu " should "work correctly" in { + val nInputPlane = 2 + val nOutputPlane = 4 + val kW = 3 + val kH = 3 + val dW = 4 + val dH = 4 + val padW = 0 + val padH = 0 + val ngroup = 2 + + val input = Tensor[Float](2, 2, 23, 23).apply1(e => Random.nextFloat()) + val gradOutput = Tensor[Float](2, 4, 6, 6).apply1(e => Random.nextFloat()) + RNG.setSeed(100) + val conv = SpatialConvolution(nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH, ngroup) + RNG.setSeed(100) + val conv1 = nn.SpatialConvolution[Float](nInputPlane, nOutputPlane, kW, kH, dW, dH, padW, padH, + ngroup) + + val relu = ReLU() + val relu1 = nn.ReLU[Float](ip = false) + + val model = Sequential().add(conv).add(relu) + .add(ReorderMemory(HeapData(Array(2, 4, 6, 6), Memory.Format.nchw))) + model.compile(TrainingPhase, Array(HeapData(Array(2, 2, 23, 23), Memory.Format.nchw))) + + val model1 = nn.Sequential().add(conv1).add(relu1) + + model.forward(input) + model.backward(input, gradOutput) + + model1.forward(input) + model1.backward(input, gradOutput) + + val output = Tools.toNCHW(conv.output.toTensor, conv.outputFormats()(0)) + val gradInput = Tools.toNCHW(conv.gradInput.toTensor, conv.gradInputFormats()(0)) + + val weight = Tools.toOIHW(conv.weight, conv.ParamsShape.weight) + val gradweight = Tools.toOIHW(conv.gradWeight, conv.ParamsShape.gradWeight) + val bias = Tools.dense(conv.bias).toTensor + val gradbias = Tools.dense(conv.gradBias).toTensor + + val output1 = conv1.output.toTensor + val gradInput1 = conv1.gradInput + + val weight1 = conv1.weight + val gradweight1 = conv1.gradWeight + val bias1 = conv1.bias + val gradbias1 = conv1.gradBias + + Equivalent.nearequals(weight, weight1) should be(true) + Equivalent.nearequals(gradweight, gradweight1) should be(true) + Equivalent.nearequals(bias, bias1) should be(true) + Equivalent.nearequals(gradbias, gradbias1) should be(true) + Equivalent.nearequals(output, output1) should be(true) + Equivalent.nearequals(gradInput, gradInput1) should be(true) + } + + "ConvolutionDnn with same params with vgg16" should "work correctly" in { + val batchSize = 2 + val needPropagateBack: Boolean = true + val inputShape = Array(batchSize, 3, 224, 224) + val outputShape = Array(batchSize, 64, 112, 112) + + RNG.setSeed(100) + val model1 = nn.SpatialConvolution[Float](3, 64, 7, 7, 2, 2, 3, 3, 1) + .setInitMethod(weightInitMethod = Xavier, Zeros) + 
model1.zeroGradParameters() + val (weightAll1, gradWeightAll1) = model1.parameters() + + RNG.setSeed(100) + val model2 = SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3, 1) + model2.zeroGradParameters() + + model2.setRuntime(new MklDnnRuntime) + model2.initFwdPrimitives(Array(HeapData(inputShape, Memory.Format.nchw)), TrainingPhase) + model2.initBwdPrimitives(Array(HeapData(outputShape, Memory.Format.nchw)), TrainingPhase) + model2.initGradWPrimitives(Array(HeapData(outputShape, Memory.Format.nchw)), TrainingPhase) + + val initWeight = Tools.fromOIHW(weightAll1(0), model2.ParamsShape.weight) + model2.weight.copy(initWeight) + model2.bias.copy(model1.bias) + + RNG.setSeed(1) + val input = Tensor(batchSize, 3, 224, 224).apply1(e => RNG.uniform(0, 1).toFloat) + val gradOutput = Tensor(outputShape).apply1(_ => RNG.uniform(0, 1).toFloat) + + val (weightAll2, gradWeightAll2) = model2.parameters() + + val out1 = model1.forward(input).toTensor[Float] + val out2 = model2.forward(input).toTensor[Float] + + var userOut2 = Tools.toNCHW(out2, model2.outputFormats()(0)) + + Equivalent.nearequals(out1, userOut2, 1e-4) should be(true) + + val grad1 = model1.updateGradInput(input, gradOutput).toTensor[Float] + val grad2 = model2.updateGradInput(input, gradOutput).toTensor[Float] + + val userGradInput2 = Tools.toNCHW(grad2, model2.gradInputFormats()(0)) + + Equivalent.nearequals(grad1, userGradInput2, 1e-4) should be(true) + + model1.accGradParameters(input, gradOutput) + model2.accGradParameters(input, gradOutput) + + val gw1 = model1.gradWeight + val gb1 = model1.gradBias + + val gw2 = Tools.toOIHW(model2.gradWeight, model2.ParamsShape.gradWeight) + val gb2 = Tools.dense(model2.gradBias).toTensor + + Equivalent.nearequals(gw1, gw2, 1e-4) should be(true) + Equivalent.nearequals(gb1, gb2, 1e-3) should be(true) + } + + "a simple convolution compared with caffe" should "work correctly" ignore { + val inputShape = Array(4, 3, 5, 5) + val outputShape = Array(4, 2, 3, 3) + val name = "conv" + val nOutput = 2 + val kernel = 3 + val pad = 1 + val stride = 2 + + val txt = prototxt(inputShape, name, nOutput, kernel, pad, stride) + + val conv = new SpatialConvolution(3, nOutput, kernel, kernel, stride, stride, pad, pad, 1) + conv.setName(name) + conv.setRuntime(new MklDnnRuntime) + conv.initFwdPrimitives(Array(HeapData(inputShape, Memory.Format.nchw)), TrainingPhase) + conv.initBwdPrimitives(Array(HeapData(outputShape, Memory.Format.nchw)), TrainingPhase) + conv.initGradWPrimitives(Array(HeapData(outputShape, Memory.Format.nchw)), TrainingPhase) + Tools.compare(txt, conv, inputShape, outputShape) + } + + "conv exists some format conversion" should "work correctly" ignore { + val inputShape = Array(4, 3, 224, 224) + val outputShape = Array(4, 64, 112, 112) + + val name = "conv" + val conv = SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3).setName(name) + // TODO we should insert a reorder manually + val reorder1 = ReorderMemory(HeapData(inputShape, Memory.Format.nchw)) + val reorder2 = ReorderMemory(HeapData(outputShape, Memory.Format.nchw)) + + val seq = Sequential() + seq.add(reorder1) + seq.add(conv) + seq.add(reorder2) + seq.compile(Phase.TrainingPhase, Array(HeapData(inputShape, Memory.Format.nchw))) + seq.reset() + + val txt = prototxt(inputShape, name, outputShape(1), 7, 3, 2) + val identity = Collect.run(txt) + + val input = Tools.getTensor("Fwrd_data", inputShape, identity) + val gradOutput = Tools.getTensor(s"Bwrd_$name.loss", outputShape, identity) + val output = Tools.getTensor(s"Fwrd_$name", outputShape, identity) 
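+ // The blobs dumped by the collect tool follow a fixed naming scheme: "Fwrd_<layer>" and
+ // "Bwrd_<layer>" hold a layer's output and gradInput, "Bwrd_<layer>.loss" holds the
+ // injected top_diff, and "Fwrd_<layer>.Wght.<j>" / "Bwrd_<layer>.Grad.<j>" hold the j-th
+ // parameter and its gradient. For example (illustrative only):
+ //   val gradWeight0 = Tools.getTensor(s"Bwrd_$name.Grad.0", conv.parameters()._1(0).size(), identity)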
+ val gradInput = Tools.getTensor(s"Bwrd_$name", inputShape, identity) + + if (conv.parameters() != null) { + val params = conv.parameters()._1 + val infos = conv.parametersWithShape()._1 + val name = conv.getName() + + for (j <- params.indices) { + val w = Tools.getTensor(s"Fwrd_$name.Wght.$j", params(j).size(), identity) + params(j).copy(normal(w, infos(j))) + } + } + + seq.forward(input) + seq.backward(input, gradOutput) + + Tools.compare2Tensors(Tools.dense(seq.output).toTensor, output) should be (true) + Tools.compare2Tensors(Tools.dense(seq.gradInput).toTensor, gradInput) should be (true) + + val params = seq.parameters()._2 + val infos = conv.parametersWithShape()._2 + for (j <- params.indices) { + val w = Tools.getTensor(s"Bwrd_$name.Grad.$j", params(j).size(), identity) + Tools.compare2Tensors(params(j), normal(w, infos(j))) should be (true) + } + } + + "conv kernel 1x1 with reorder in container" should "work correctly" ignore { + val inputShape = Array(4, 64, 56, 56) + val outputShape = Array(4, 64, 56, 56) + + val name = "conv" + val conv = SpatialConvolution(64, 64, 1, 1, 1, 1, 0, 0).setName(name) + // TODO we should insert a reorder manually + val reorder1 = ReorderMemory(HeapData(inputShape, Memory.Format.nchw)) + val reorder2 = ReorderMemory(HeapData(outputShape, Memory.Format.nchw)) + + val seq = Sequential() + seq.add(reorder1) + seq.add(conv) + seq.add(reorder2) + seq.compile(Phase.TrainingPhase, Array(HeapData(inputShape, Memory.Format.nchw))) + seq.reset() + + val txt = prototxt(inputShape, name, outputShape(1), 1, 0, 1) + val identity = Collect.run(txt) + + val input = Tools.getTensor("Fwrd_data", inputShape, identity) + val gradOutput = Tools.getTensor(s"Bwrd_$name.loss", outputShape, identity) + val output = Tools.getTensor(s"Fwrd_$name", outputShape, identity) + val gradInput = Tools.getTensor(s"Bwrd_$name", inputShape, identity) + + if (conv.parameters() != null) { + val params = conv.parameters()._1 + val infos = conv.parametersWithShape()._1 + val name = conv.getName() + + for (j <- params.indices) { + val w = Tools.getTensor(s"Fwrd_$name.Wght.$j", params(j).size(), identity) + params(j).copy(normal(w, infos(j))) + } + } + + seq.forward(input) + seq.backward(input, gradOutput) + + Tools.compare2Tensors(Tools.dense(seq.output).toTensor, output) should be (true) + Tools.compare2Tensors(Tools.dense(seq.gradInput).toTensor, gradInput) should be (true) + + val params = seq.parameters()._2 + val infos = conv.parametersWithShape()._2 + for (j <- params.indices.reverse) { + val w = Tools.getTensor(s"Bwrd_$name.Grad.$j", params(j).size(), identity) + Tools.compare2Tensors(params(j), normal(w, infos(j))) should be (true) + } + } + + "conv + bn" should "work correctly" ignore { + val inputShape = Array(4, 3, 224, 224) + val outputShape = Array(4, 64, 112, 112) + val channel = 64 + + val name = "conv" + val conv = SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3).setName("conv") + val bn = SpatialBatchNormalization(64, momentum = 1.0, eps = 100).setName("bn") + // TODO we should insert a reorder manually + val reorder1 = ReorderMemory(HeapData(inputShape, Memory.Format.nchw)).setName("reorder1") + val reorder2 = ReorderMemory(HeapData(outputShape, Memory.Format.nchw)).setName("reorder2") + + val seq = Sequential() + seq.add(reorder1) + seq.add(conv) + seq.add(bn) + seq.add(reorder2) + seq.compile(Phase.TrainingPhase, Array(HeapData(inputShape, Memory.Format.nchw))) + seq.reset() + seq.training() + + val txt = prototxt2(inputShape, name, outputShape(1), 7, 3, 2) + + """ + |layer { 
+ | bottom: "conv" + | top: "bn" + | name: "bn" + | type: "BatchNorm" + | + | batch_norm_param { + | moving_average_fraction: 1.0 + | filler { value: 1 } + | bias_filler { value: 0 } + | relu: false + | eps: 100 + | } + |} + """.stripMargin + Tools.compare(txt, seq, inputShape, outputShape, 1e-2) + } + + def prototxt(inputShape: Array[Int], name: String, + nOutput: Int, kernel: Int, pad: Int, stride: Int): String = { + s""" + |name: "conv-simple" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { ${shape2Dim(inputShape)} } + | } + |} + | + |layer { + | bottom: "data" + | top: "conv" + | name: "$name" + | type: "Convolution" + | convolution_param { + | num_output: $nOutput + | kernel_size: $kernel + | pad: $pad + | stride: $stride + | weight_filler { + | type: "msra" + | variance_norm: FAN_OUT + | } + | bias_filler { + | type: "gaussian" + | } + | } + |} + """.stripMargin + } + + def prototxt2(inputShape: Array[Int], name: String, + nOutput: Int, kernel: Int, pad: Int, stride: Int): String = { + s""" + |name: "conv-simple" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "uniform" + | min: -1000 + | max: 1000 + | } + | shape: { ${shape2Dim(inputShape)} } + | } + |} + | + |layer { + | bottom: "data" + | top: "conv" + | name: "$name" + | type: "Convolution" + | convolution_param { + | num_output: $nOutput + | kernel_size: $kernel + | pad: $pad + | stride: $stride + | weight_filler { + | type: "msra" + | variance_norm: FAN_OUT + | } + | bias_filler { + | type: "gaussian" + | } + | } + |} + """.stripMargin + } + + def normal(src: Tensor[Float], outputFormat: MemoryData): Tensor[Float] = { + val defaultFormat = src.size().length match { + case 1 => Memory.Format.x + case 2 => Memory.Format.oi + case 4 => Memory.Format.oihw + } + + if (defaultFormat != outputFormat.layout) { + val inputFormat = HeapData(src.size(), defaultFormat) + val reorder = ReorderMemory(inputFormat, outputFormat, null, null) + reorder.setRuntime(new MklDnnRuntime) + reorder.initFwdPrimitives(Array(inputFormat), TrainingPhase) + reorder.updateOutput(src).toTensor + } else { + src + } + } + + private def shape2Dim(shape: Array[Int]): String = { + shape.map(x => "dim: " + x).mkString(" ") + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/TestUtils.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/TestUtils.scala new file mode 100644 index 00000000000..e80f4d052fe --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/TestUtils.scala @@ -0,0 +1,514 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import java.io.{File, PrintWriter} +import java.nio.channels.FileChannel +import java.nio.file.{Files, Paths, StandardOpenOption} +import java.nio.{ByteBuffer, ByteOrder} + +import breeze.numerics.abs +import com.intel.analytics.bigdl.Module +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.nn.Container +import com.intel.analytics.bigdl.nn.abstractnn.Activity +import com.intel.analytics.bigdl.nn.mkldnn.Phase.TrainingPhase +import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric +import com.intel.analytics.bigdl.tensor.{DenseTensorMath, Storage, Tensor} + +import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag +import scala.sys.process._ + +object Tools { + def error[@specialized(Float, Double) T: ClassTag](tensor1: Tensor[T], tensor2: Tensor[T])( + implicit ev: TensorNumeric[T]): Double = { + require(tensor1.nElement() == tensor2.nElement()) + var ret = 0.0 + val storage1 = tensor1.storage().array() + val storage2 = tensor2.storage().array() + for (i <- 0 until tensor1.nElement()) { + ret += math.abs( + ev.toType[Double](storage1(i)) - ev.toType[Double](storage2(i))) + } + ret + } + + def cumulativeError[T: ClassTag](tensor1: Tensor[T], tensor2: Tensor[T], msg: String)( + implicit ev: TensorNumeric[T]): Double = { + val ret = error[T](tensor1, tensor2) + println((msg, "CUMULATIVE ERROR:", ret).productIterator.mkString(" ").toUpperCase) + ret + } + + def averageError[T: ClassTag](tensor1: Tensor[T], tensor2: Tensor[T], msg: String)( + implicit ev: TensorNumeric[T]): Double = { + require(tensor1.nElement() > 0) + val ret = error[T](tensor1, tensor2) / tensor1.nElement() + println((msg, "AVERAGE ERROR:", ret).productIterator.mkString(" ").toUpperCase) + ret + } + + def averageError[T: ClassTag](m1: Map[String, Tensor[T]], + m2: Map[String, Tensor[T]], + err: Map[String, Double])(implicit ev: TensorNumeric[T]): Unit = { + require(m1.keySet == m2.keySet) + require(m1.keySet subsetOf err.keySet) + + m1.keySet.foreach(i => { + val err = error(m1(i), m2(i)) / m1(i).nElement() + printf("%20s = %E\n", i.toUpperCase(), err) + }) + } + + def averageAllTensors[T: ClassTag](tensor1: Tensor[T], msg: String = "Unknown")( + implicit ev: TensorNumeric[T]): Unit = { + val sum = tensor1.storage().array().foldLeft(ev.fromType[Int](0))((l, r) => ev.plus(l, r)) + val num = ev.fromType[Int](tensor1.nElement()) + println(("AVERGE", msg, ev.divide(sum, num)).productIterator.mkString(" ").toUpperCase()) + } + + def printTensor[T: ClassTag](tensor: Tensor[T], num: Int = 16, msg: String = "Unknown")( + implicit ev: TensorNumeric[T]): Unit = { + println(msg.toUpperCase) + for (i <- 0 until num) { + println((i, ev.toType[Double](tensor.storage().array()(i))).productIterator.mkString("\t")) + } + } + + private def fileName(name: String, identity: String): String = { + val tmpdir = System.getProperty("java.io.tmpdir") + val dirname = if (tmpdir.endsWith("/")) { + tmpdir + } else { + tmpdir + "/" + } + + val filename = if (identity.isEmpty) { + ".bin" + } else { + "." 
+ identity + ".bin" + } + + dirname + name + filename + } + + /* + * @brief read binary in tmp dir to Tensor, which is used for comparing + * with Intel Caffe with MKL-DNN + */ + def getTensor(name: String, size: Array[Int], identity: String): Tensor[Float] = { + val tensor = Tensor[Float]() + val file = fileName(name, identity) + + if (Files.exists(Paths.get(file))) { + println(s"[INFO] load $file") + setTensorFloat() + + def loadData(name: String): ByteBuffer = { + val fileChannel: FileChannel = Files.newByteChannel( + Paths.get(name), + StandardOpenOption.READ, + StandardOpenOption.DELETE_ON_CLOSE).asInstanceOf[FileChannel] + val byteBuffer: ByteBuffer = ByteBuffer.allocate(fileChannel.size().toInt) + byteBuffer.order(ByteOrder.nativeOrder()) + fileChannel.read(byteBuffer) + byteBuffer.flip() + byteBuffer + } + + + def setTensorFloat(): Unit = { + val data = loadData(file).asFloatBuffer() + val array = new Array[Float](data.limit()) + data.get(array) + assert(size.product == array.length, s"the data length is not correct") + tensor.set(Storage(array), sizes = size) + } + } + + tensor + } + + def flattenModules(model: Module[Float], modules: ArrayBuffer[Module[Float]]): Unit = { + model match { + case container : Container[_, _, _] => + if (container.modules.nonEmpty) { + for (i <- container.modules) { + flattenModules(i.asInstanceOf[Module[Float]], modules) + } + } + case x => if (!x.isInstanceOf[ReorderMemory] && !x.isInstanceOf[Identity]) { + modules += model + } + } + } + + def randTimes(): Int = 10 + + def loadWeights(module: Module[Float], identity: String): Unit = { + val params = module.parameters()._1 + val name = module.getName() + module match { + case bn: SpatialBatchNormalization => + val channel = bn.weightAndBias.size(1) / 2 + + val weight = Tools.getTensor(s"Fwrd_${bn.getName}.Wght.3", Array(channel), identity) + val bias = Tools.getTensor(s"Fwrd_${bn.getName}.Wght.4", Array(channel), identity) + val weightAndBias = Tensor[Float].resize(Array(2, channel)) + if (weight.isEmpty) {weight.resize(Array(channel)).fill(1)} + weightAndBias.select(1, 1).copy(weight) + if (bias.isEmpty) { + bias.resize(Array(channel)).fill(0) + } + weightAndBias.select(1, 2).copy(bias) + bn.weightAndBias.copy(weightAndBias.view(bn.weightAndBias.size())) + case _ => + for (j <- params.indices) { + val w = Tools.getTensor(s"Fwrd_$name.Wght.$j", params(j).size(), identity) + module match { + case layer: MklDnnLayer => + val infos = layer.parametersWithShape()._1 + val weights = if (!w.isEmpty) { + params(j).copy(fromOIHW(w, infos(j))) + } else { + val zeros = Tensor[Float]().resize(params(j).size()).fill(0) + params(j).copy(zeros) + } + case _ => + params(j).copy(w) + } + } + } + } + + def compareGradients(module: Module[Float], epsilon: Float, identity: String): Boolean = { + var ret = true + + val name = module.getName() + val params = module.parameters()._2 + + module match { + case bn: SpatialBatchNormalization => + val channel = bn.weightAndBias.size(1) / 2 + + val weight = Tools.getTensor(s"Bwrd_${bn.getName}.Grad.3", Array(channel), identity) + val bias = Tools.getTensor(s"Bwrd_${bn.getName}.Grad.4", Array(channel), identity) + val weightAndBias = Tensor[Float].resize(Array(2, channel)) + weightAndBias.select(1, 1).copy(weight) + weightAndBias.select(1, 2).copy(bias) + + ret &= Equivalent.nearequals(weightAndBias.view(bn.gradWeightAndBias.size()), + dense(bn.gradWeightAndBias).toTensor, epsilon) + val runningMean = Tools.getTensor(s"Fwrd_$name.Wght.0", Array(channel), identity) + val 
runningVariance = Tools.getTensor(s"Fwrd_$name.Wght.1", Array(channel), identity) + + ret &= compare2Tensors(runningMean, dense(bn.runningMean).toTensor) + ret &= compare2Tensors(runningVariance, dense(bn.runningVariance).toTensor) + + assert(ret, s"${module.getName()} gradient can't pass, please check") + case _ => + for (j <- params.indices) { + val w = Tools.getTensor(s"Bwrd_$name.Grad.$j", params(j).size(), identity) + module match { + case layer: MklDnnLayer => + val infos = layer.parametersWithShape()._2 + ret &= Equivalent.nearequals(dense(params(j)).toTensor, + dense(fromOIHW(w, infos(j))).toTensor, epsilon) + case _ => ret &= compare2Tensors(params(j), w) + } + + assert(ret, s"${module.getName()} gradient $j can't pass, please check") + } + } + + ret + } + + def compare(prototxt: String, model: Module[Float], inputShape: Array[Int], + outputShape: Array[Int], epsilon: Double = 1e-7): Unit = { + val identity = Collect.run(prototxt, singleLayer = true) + val modules = ArrayBuffer[Module[Float]]() + Tools.flattenModules(model, modules) + + val input = Tools.getTensor("Fwrd_data", inputShape, identity) + val gradOutput = Tools.getTensor(s"Bwrd_${modules.last.getName()}.loss", outputShape, identity) + + modules.filter(_.parameters() != null).foreach(loadWeights(_, identity)) + + model.forward(input) + model.backward(input, gradOutput) + + for (i <- modules.indices) { + compareSingleLayer(modules(i), identity) + } + + def compareSingleLayer(module: Module[Float], identity: String): Boolean = { + val name = module.getName() + val bigdlOutput = module.output.toTensor[Float] + val bigdlGradInput = if (module.isInstanceOf[CAddTable]) { + module.gradInput.toTable.apply[Tensor[Float]](1) + } else { + module.gradInput.toTensor[Float] + } + + val output = Tools.getTensor(s"Fwrd_$name", bigdlOutput.size(), identity) + val gradInput = Tools.getTensor(s"Bwrd_$name", bigdlGradInput.size(), identity) + + var ret = true + + module match { + case layer: MklDnnLayer => + ret &= compare2Tensors(output, toNCHW(bigdlOutput, layer.outputFormats()(0))) + assert(ret, s"${module.getName()} output can't pass, please check") + + ret &= compare2Tensors(gradInput, toNCHW(bigdlGradInput, layer.gradInputFormats()(0))) + assert(ret, s"${module.getName()} gradInput can't pass, please check") + case _ => + ret &= compare2Tensors(output, bigdlOutput) + assert(ret, s"${module.getName()} output can't pass, please check") + + ret &= compare2Tensors(gradInput, bigdlGradInput) + assert(ret, s"${module.getName()} gradInput can't pass, please check") + } + + if (module.parameters() == null) { + return ret + } + + val params = module.parameters()._2 + compareGradients(module, epsilon.toFloat, identity) + + ret + } + } + + def compare2Tensors(src: Tensor[Float], dst: Tensor[Float]): Boolean = { + Equivalent.nearequals(dense(src).toTensor, dense(dst).toTensor) + } + + def dense(t: Activity): Activity = { + val ret = if (t.isTensor) { + val tt = t.asInstanceOf[Tensor[Float]] + Tensor[Float]().resize(tt.size()).copy(tt) + } else { + throw new UnsupportedOperationException + } + + ret + } + + def toNCHW(src: Tensor[Float], inputFormat: MemoryData): Tensor[Float] = { + val outputFormat = HeapData(inputFormat.shape, + if (src.size().length == 2) { Memory.Format.nc } else { Memory.Format.nchw }) + val reorder = ReorderMemory(inputFormat, outputFormat, null, null) + + reorder.setRuntime(new MklDnnRuntime) + reorder.initFwdPrimitives(Array(inputFormat), TrainingPhase) + reorder.forward(src).toTensor + } + + def fromNCHW(src: 
Tensor[Float], outputFormat: MemoryData): Tensor[Float] = { + val defaultFormat = src.size().length match { + case 1 => Memory.Format.x + case 2 => Memory.Format.nc + case 4 => Memory.Format.nchw + } + + val inputFormat = HeapData(src.size(), defaultFormat) + val reorder = ReorderMemory(inputFormat, outputFormat, null, null) + reorder.setRuntime(new MklDnnRuntime) + reorder.initFwdPrimitives(Array(inputFormat), TrainingPhase) + reorder.forward(src).toTensor + } + + def fromOIHW(src: Tensor[Float], outputFormat: MemoryData): Tensor[Float] = { + val defaultFormat = outputFormat.shape.length match { + case 1 => Memory.Format.x + case 2 => Memory.Format.oi + case 4 => Memory.Format.oihw + } + + val inputFormat = HeapData(outputFormat.shape, defaultFormat) + val reorder = ReorderMemory(inputFormat, outputFormat, null, null) + reorder.setRuntime(new MklDnnRuntime) + reorder.initFwdPrimitives(Array(inputFormat), TrainingPhase) + reorder.updateOutput(src).toTensor + } + + def toOIHW(src: Tensor[Float], inputFormat: MemoryData): Tensor[Float] = { + val defaultFormat = inputFormat.shape.length match { + case 1 => Memory.Format.x + case 2 => Memory.Format.oi + case 4 => Memory.Format.oihw + case 5 => Memory.Format.goihw + } + + val outputFormat = HeapData(inputFormat.shape, defaultFormat) + val reorder = ReorderMemory(inputFormat, outputFormat, null, null) + reorder.setRuntime(new MklDnnRuntime) + reorder.initFwdPrimitives(Array(inputFormat), TrainingPhase) + reorder.updateOutput(src).toTensor + } +} + +/** + * Call the "collect" command, which collects output binary files. + * It's similar to "caffe collect"; the difference is that it supports collecting a + * single layer's output and gradient by making a fake gradOutput/top_diff. + */ +object Collect { + val tmpdir: String = System.getProperty("java.io.tmpdir") + val collectPath: String = System.getProperty("collect.location") + + def hasCollect: Boolean = { + val exitValue = if (collectPath != null) s"ls $collectPath".! else "which collect".! + exitValue == 0 + } + + /** + * Save the prototxt to a temporary file and call collect. + * @param prototxt the prototxt as a string + * @return the random number in the middle of the temporary file name, which is an identity for getTensor. + */ + def run(prototxt: String, singleLayer: Boolean = true): String = { + def saveToFile(prototxt: String, name: String): String = { + val tmpFile = java.io.File.createTempFile(name, ".prototxt") + val absolutePath = tmpFile.getAbsolutePath + + println(s"prototxt is saved to $absolutePath") + + val writer = new PrintWriter(tmpFile) + writer.println(prototxt) + writer.close() + + absolutePath + } + + if (! hasCollect) { + throw new RuntimeException(s"Can't find the collect command. Have you copied it to the PATH?") + } + + val file = saveToFile(prototxt, "UnitTest.") // "UnitTest." ends with a dot so the random number can be extracted + val identity = file.split("""\.""").reverse(1) // get the random number + + val cmd = Seq(s"$collectPath", "--model", file, "--type", "float", "--identity", identity) + val exitValue = if (singleLayer) { + Process(cmd :+ "--single", new File(tmpdir)).! + } else { + Process(cmd, new File(tmpdir)).! + } + + Files.deleteIfExists(Paths.get(file)) + require(exitValue == 0, s"Something wrong with collect command. 
Please check it.") + + identity + } +} + +object Utils { + def time[R](block: => R): (Double, R) = { + val t0 = System.nanoTime() + val result = block + val t1 = System.nanoTime() + val takes = (t1 - t0) / 1e9 + (takes, result) + } + + def manyTimes[R](block: => R)(iters: Int): (Double, R) = { + time[R] { + var i = 0 + while (i < iters - 1) { + block + i += 1 + } + block + } + } + + def speedup(base: Double, after: Double): String = { + val result = (base - after) / base + ((result * 1000).toInt / 10.0).toString + "%" + } +} + +object Equivalent { + + def nearlyEqual(a: Float, b: Float, epsilon: Double): Boolean = { + val absA = math.abs(a) + val absB = math.abs(b) + val diff = math.abs(a - b) + + val result = if (a == b) { + true + } else { + math.min(diff / (absA + absB), diff) < epsilon + } + + result + } + + def nearequals(t1: Tensor[Float], t2: Tensor[Float], + epsilon: Double = DenseTensorMath.floatEpsilon): Boolean = { + var result = true + t1.map(t2, (a, b) => { + if (result) { + result = nearlyEqual(a, b, epsilon) + if (!result) { + val diff = math.abs(a - b) + println("epsilon " + a + "***" + b + "***" + diff / (abs(a) + abs(b)) + "***" + diff) + } + } + a + }) + result + } + + def getunequals(t1: Tensor[Float], t2: Tensor[Float], + epsilon: Double = DenseTensorMath.floatEpsilon): Boolean = { + var result = true + var num = 0 + t1.map(t2, (a, b) => { + if (true) { + result = nearlyEqual(a, b, epsilon) + if (!result) { + num += 1 + val diff = math.abs(a - b) + println("epsilon " + a + "***" + b + "***" + diff / (abs(a) + abs(b)) + "***" + diff) + } + } + a + }) + println("diff num " + num) + return true + } + + def isEquals(t1: Tensor[Float], t2: Tensor[Float]): Boolean = { + var result = true + t1.map(t2, (a, b) => { + if (result) { + result = if (a == b) true else false + if (!result) { + val diff = Math.abs(a - b) + println("epsilon " + a + "***" + b + "***" + diff / (abs(a) + abs(b)) + "***" + diff) + } + } + a + }) + return result + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/TopologySpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/TopologySpec.scala new file mode 100644 index 00000000000..922435a3a90 --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/nn/mkldnn/TopologySpec.scala @@ -0,0 +1,1011 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.intel.analytics.bigdl.nn.mkldnn + +import com.intel.analytics.bigdl.mkl.Memory +import com.intel.analytics.bigdl.nn.mkldnn.Phase.TrainingPhase +import com.intel.analytics.bigdl.numeric.NumericFloat +import com.intel.analytics.bigdl.{Module, nn} +import org.scalatest.{FlatSpec, Ignore, Matchers} + +@Ignore +class TopologySpec extends FlatSpec with Matchers { + + "LeNet5 has no tanh" should "work correctly" in { + val inputShape = Array(4, 1, 28, 28) + val outputShape = Array(4, 10) + val prototxt = s""" + |name: "LeNet" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { ${shape2Dim(inputShape)} } + | } + |} + |layer { + | name: "conv1" + | type: "Convolution" + | bottom: "data" + | top: "conv1" + | param { + | lr_mult: 1 + | } + | param { + | lr_mult: 2 + | } + | convolution_param { + | num_output: 20 + | kernel_size: 5 + | stride: 1 + | weight_filler { + | type: "msra" + | variance_norm: FAN_OUT + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + |layer { + | name: "pool1" + | type: "Pooling" + | bottom: "conv1" + | top: "pool1" + | pooling_param { + | pool: MAX + | kernel_size: 2 + | stride: 2 + | } + |} + |layer { + | name: "conv2" + | type: "Convolution" + | bottom: "pool1" + | top: "conv2" + | param { + | lr_mult: 1 + | } + | param { + | lr_mult: 2 + | } + | convolution_param { + | num_output: 50 + | kernel_size: 5 + | stride: 1 + | weight_filler { + | type: "xavier" + | } + | bias_filler { + | type: "constant" + | } + | } + |} + |layer { + | name: "pool2" + | type: "Pooling" + | bottom: "conv2" + | top: "pool2" + | pooling_param { + | pool: MAX + | kernel_size: 2 + | stride: 2 + | } + |} + |layer { + | name: "ip1" + | type: "InnerProduct" + | bottom: "pool2" + | top: "ip1" + | param { + | lr_mult: 1 + | } + | param { + | lr_mult: 2 + | } + | inner_product_param { + | num_output: 500 + | weight_filler { + | type: "xavier" + | } + | bias_filler { + | type: "constant" + | } + | } + |} + |layer { + | name: "relu1" + | type: "ReLU" + | bottom: "ip1" + | top: "ip1" + |} + |layer { + | name: "ip2" + | type: "InnerProduct" + | bottom: "ip1" + | top: "ip2" + | param { + | lr_mult: 1 + | } + | param { + | lr_mult: 2 + | } + | inner_product_param { + | num_output: 10 + | weight_filler { + | type: "xavier" + | } + | bias_filler { + | type: "constant" + | } + | } + |} + """.stripMargin +// |layer { +// | name: "prob" +// | type: "Softmax" +// | bottom: "ip2" +// | top: "prob" +// |} +// | + + val bigdl = Sequential() + .add(SpatialConvolution(1, 20, 5, 5).setName("conv1")) + .add(MaxPooling(2, 2, 2, 2).setName("pool1")) + .add(SpatialConvolution(20, 50, 5, 5).setName("conv2")) + .add(MaxPooling(2, 2, 2, 2).setName("pool2")) + .add(Linear(50 * 4 * 4, 500).setName("ip1")) + .add(ReLU().setName("relu1")) + .add(Linear(500, 10).setName("ip2")) + .add(ReorderMemory(HeapData(outputShape, Memory.Format.nc))) +// .add(SoftMax().setName("prob")) // TODO SoftMax is totally different with Caffe. 
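+ // compile propagates the input HeapData through the sequence for the given phase and sets
+ // up the MKL-DNN primitives of every layer; the trailing ReorderMemory above brings the
+ // native output back to a plain nc layout so Tools.compare can match it against Caffe's blobs.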
+ bigdl.compile(TrainingPhase, Array(HeapData(inputShape, Memory.Format.nchw))) + + Tools.compare(prototxt, bigdl, inputShape, outputShape, 1e-6) + } + + "eltwise" should "work correctly" in { + val nInput = 3 + val nOutput = 2 + val inputShape = Array(4, 3, 5, 5) + val outputShape = Array(4, 2, 3, 3) + + val kernel = 3 + val pad = 1 + val stride = 2 + + val prototxt = + s""" + | name: "eltwise-simple" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "xavier" + | } + | shape: { dim: 4 dim: 3 dim: 5 dim: 5 } + | } + |} + |layer { + | bottom: "data" + | top: "conv1" + | name: "conv1" + | type: "Convolution" + | convolution_param { + | num_output: 2 + | kernel_size: 3 + | pad: 1 + | stride: 2 + | weight_filler { + | # type: "msra" + | # variance_norm: FAN_OUT + | type: "constant" + | value: 0.1 + | } + | bias_filler { + | # type: "gaussian" + | type: "constant" + | value: 0.1 + | } + | } + |} + |layer { + | bottom: "data" + | top: "conv2" + | name: "conv2" + | type: "Convolution" + | convolution_param { + | num_output: 2 + | kernel_size: 3 + | pad: 1 + | stride: 2 + | weight_filler { + | # type: "msra" + | # variance_norm: FAN_OUT + | type: "constant" + | value: 0.1 + | } + | bias_filler { + | # type: "gaussian" + | type: "constant" + | value: 0.1 + | } + | } + |} + |layer { + | bottom: "conv1" + | bottom: "conv2" + | top: "eltwise" + | name: "eltwise" + | type: "Eltwise" + | eltwise_param { + | } + |} + | + """.stripMargin + + val conv1 = SpatialConvolution(nInput, nOutput, kernel, kernel, stride, stride, pad, pad, 1) + .setName("conv1") + val conv2 = SpatialConvolution(nInput, nOutput, kernel, kernel, stride, stride, pad, pad, 1) + .setName("conv2") + val model = Sequential() + .add(ConcatTable().add(conv2).add(conv1)) + .add(CAddTable().setName("eltwise")) + .add(ReorderMemory(HeapData(outputShape, Memory.Format.nchw))) + + model.compile(TrainingPhase, Array(HeapData(inputShape, Memory.Format.nchw))) + + Tools.compare(prototxt, model, inputShape, outputShape) + } + + "resnet 50" should "work correctly" in { + val prototxt = + s""" + |name: "ResNet-50" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | top: "label" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "constant" + | value: 0.01 + | } + | shape: { dim: 4 dim: 3 dim: 224 dim: 224 } + | shape: { dim: 4 dim: 1 dim: 1 dim: 1 } + | } + |} + | + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | top: "label" + | include { + | phase: TEST + | } + | dummy_data_param { + | data_filler { + | type: "constant" + | value: 0.01 + | } + | shape: { dim: 32 dim: 3 dim: 224 dim: 224 } + | shape: { dim: 32 dim: 1 dim: 1 dim: 1 } + | } + |} + | + |layer { + | bottom: "data" + | top: "conv1" + | name: "conv1" + | type: "Convolution" + | convolution_param { + | num_output: 64 + | kernel_size: 7 + | pad: 3 + | stride: 2 + | weight_filler { + | type: "msra" + | variance_norm: FAN_OUT + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |# layer { + |# bottom: "conv1" + |# top: "conv1" + |# name: "bn_conv1" + |# type: "BatchNorm" + |# param { lr_mult: 0 } + |# param { lr_mult: 0 } + |# param { lr_mult: 0 } + |# batch_norm_param { + |# moving_average_fraction: 0.9 + |# filler { value: 1 } + |# } + |# } + | + |layer { + | bottom: "conv1" + | top: "conv1" + | name: "scale_conv1" + | type: "Scale" + | param 
{ decay_mult: 0 } + | param { decay_mult: 0 } + | scale_param { + | bias_term: true + | } + |} + | + |layer { + | bottom: "conv1" + | top: "conv1" + | name: "conv1_relu" + | type: "ReLU" + | relu_param { + | } + |} + | + |layer { + | bottom: "conv1" + | top: "pool1" + | name: "pool1" + | type: "Pooling" + | pooling_param { + | kernel_size: 3 + | stride: 2 + | pool: MAX + | } + |} + """.stripMargin + val inputShape = Array(4, 3, 224, 224) + val outputShape = Array(4, 64, 56, 56) + + val model = Sequential() + .add(SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3, propagateBack = true).setName("conv1")) + .add(ReLU().setName("conv1_relu")) + .add(MaxPooling(3, 3, 2, 2).setName("pool1")) + .add(ReorderMemory(HeapData(outputShape, Memory.Format.nchw))) + model.compile(TrainingPhase, Array(HeapData(inputShape, Memory.Format.nchw))) + + Tools.compare(prototxt, model, inputShape, outputShape) + } + + "bottleneck" should "work correctly" in { + val prototxt = + s""" + |name: "ResNet-50" + |force_backward: true + |layer { + | name: "data" + | type: "DummyData" + | top: "data" + | top: "label" + | include { + | phase: TRAIN + | } + | dummy_data_param { + | data_filler { + | type: "constant" + | value: 0.01 + | } + | shape: { dim: 4 dim: 3 dim: 224 dim: 224 } + | shape: { dim: 4 dim: 1 dim: 1 dim: 1 } + | } + |} + | + |layer { + | bottom: "data" + | top: "conv1" + | name: "conv1" + | type: "Convolution" + | convolution_param { + | num_output: 64 + | kernel_size: 7 + | pad: 3 + | stride: 2 + | weight_filler { + | type: "msra" + | variance_norm: FAN_OUT + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |layer { + | bottom: "conv1" + | top: "conv1_relu" # delete inplace + | name: "conv1_relu" + | type: "ReLU" + | relu_param { + | fuse: false + | } + |} + | + |layer { + | bottom: "conv1_relu" + | top: "pool1" + | name: "pool1" + | type: "Pooling" + | pooling_param { + | kernel_size: 3 + | stride: 2 + | pool: MAX + | } + |} + | + |layer { + | bottom: "pool1" + | top: "res2a_branch1" + | name: "res2a_branch1" + | type: "Convolution" + | convolution_param { + | num_output: 256 + | kernel_size: 1 + | pad: 0 + | stride: 1 + | bias_term: true # change to true + | weight_filler { + | type: "msra" + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |layer { + | bottom: "res2a_branch1" + | top: "res2a_branch1" + | name: "scale2a_branch1" + | type: "Scale" + | param { decay_mult: 0 } + | param { decay_mult: 0 } + | scale_param { + | bias_term: true + | } + |} + | + |layer { + | bottom: "pool1" + | top: "res2a_branch2a" + | name: "res2a_branch2a" + | type: "Convolution" + | convolution_param { + | + | num_output: 64 + | kernel_size: 1 + | pad: 0 + | stride: 1 + | bias_term: true # change to true. 
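+      |    # Presumably bias_term is switched to true (the "# change to true" notes)
+      |    # so that each Caffe convolution matches the mkldnn SpatialConvolution on
+      |    # the BigDL side, which carries a bias term; the original ResNet-50
+      |    # prototxt sets bias_term: false and lets BatchNorm absorb the bias.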
+ | weight_filler { + | type: "msra" + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |layer { + | bottom: "res2a_branch2a" + | top: "res2a_branch2a" + | name: "scale2a_branch2a" + | type: "Scale" + | param { decay_mult: 0 } + | param { decay_mult: 0 } + | scale_param { + | bias_term: true + | } + |} + | + |layer { + | bottom: "res2a_branch2a" + | top: "res2a_branch2a" + | name: "res2a_branch2a_relu" + | type: "ReLU" + | relu_param { + | + | } + |} + | + |layer { + | bottom: "res2a_branch2a" + | top: "res2a_branch2b" + | name: "res2a_branch2b" + | type: "Convolution" + | convolution_param { + | num_output: 64 + | kernel_size: 3 + | pad: 1 + | stride: 1 + | bias_term: true # change to true + | weight_filler { + | type: "msra" + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |layer { + | bottom: "res2a_branch2b" + | top: "res2a_branch2b" + | name: "scale2a_branch2b" + | type: "Scale" + | param { decay_mult: 0 } + | param { decay_mult: 0 } + | scale_param { + | bias_term: true + | } + |} + | + |layer { + | bottom: "res2a_branch2b" + | top: "res2a_branch2b" + | name: "res2a_branch2b_relu" + | type: "ReLU" + | relu_param { + | + | } + |} + | + |layer { + | bottom: "res2a_branch2b" + | top: "res2a_branch2c" + | name: "res2a_branch2c" + | type: "Convolution" + | convolution_param { + | num_output: 256 + | kernel_size: 1 + | pad: 0 + | stride: 1 + | bias_term: true # change to true + | weight_filler { + | type: "msra" + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |layer { + | bottom: "res2a_branch2c" + | top: "res2a_branch2c" + | name: "scale2a_branch2c" + | type: "Scale" + | param { decay_mult: 0 } + | param { decay_mult: 0 } + | scale_param { + | bias_term: true + | } + |} + | + |layer { + | bottom: "res2a_branch1" + | bottom: "res2a_branch2c" + | top: "res2a" + | name: "res2a" + | type: "Eltwise" + | eltwise_param { + | + | } + |} + | + |layer { + | bottom: "res2a" + | top: "res2a" + | name: "res2a_relu" + | type: "ReLU" + | relu_param { + | + | } + |} + |layer { + | bottom: "res2a" + | top: "res2b_branch2a" + | name: "res2b_branch2a" + | type: "Convolution" + | convolution_param { + | num_output: 64 + | kernel_size: 1 + | pad: 0 + | stride: 1 + | bias_term: true # change to true + | weight_filler { + | type: "msra" + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |layer { + | bottom: "res2b_branch2a" + | top: "res2b_branch2a" + | name: "scale2b_branch2a" + | type: "Scale" + | param { decay_mult: 0 } + | param { decay_mult: 0 } + | scale_param { + | bias_term: true + | } + |} + | + |layer { + | bottom: "res2b_branch2a" + | top: "res2b_branch2a" + | name: "res2b_branch2a_relu" + | type: "ReLU" + | relu_param { + | + | } + |} + | + |layer { + | bottom: "res2b_branch2a" + | top: "res2b_branch2b" + | name: "res2b_branch2b" + | type: "Convolution" + | convolution_param { + | num_output: 64 + | kernel_size: 3 + | pad: 1 + | stride: 1 + | bias_term: true # change to true + | weight_filler { + | type: "msra" + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |layer { + | bottom: "res2b_branch2b" + | top: "res2b_branch2b" + | name: "scale2b_branch2b" + | type: "Scale" + | param { decay_mult: 0 } + | param { decay_mult: 0 } + | scale_param { + | bias_term: true + | } + |} + | + |layer { + | bottom: "res2b_branch2b" + | top: "res2b_branch2b" + | name: "res2b_branch2b_relu" + | type: "ReLU" + | relu_param { + | + | } + |} + | + |layer { + | bottom: 
"res2b_branch2b" + | top: "res2b_branch2c" + | name: "res2b_branch2c" + | type: "Convolution" + | convolution_param { + | num_output: 256 + | kernel_size: 1 + | pad: 0 + | stride: 1 + | bias_term: true # change to true + | weight_filler { + | type: "msra" + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |layer { + | bottom: "res2b_branch2c" + | top: "res2b_branch2c" + | name: "scale2b_branch2c" + | type: "Scale" + | param { decay_mult: 0 } + | param { decay_mult: 0 } + | scale_param { + | bias_term: true + | } + |} + | + |layer { + | bottom: "res2a" + | bottom: "res2b_branch2c" + | top: "res2b" + | name: "res2b" + | type: "Eltwise" + | eltwise_param { + | + | } + |} + | + |layer { + | bottom: "res2b" + | top: "res2b" + | name: "res2b_relu" + | type: "ReLU" + | relu_param { + | + | } + |} + | + |layer { + | bottom: "res2b" + | top: "res2c_branch2a" + | name: "res2c_branch2a" + | type: "Convolution" + | convolution_param { + | + | num_output: 64 + | kernel_size: 1 + | pad: 0 + | stride: 1 + | bias_term: true # change to true + | weight_filler { + | type: "msra" + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |layer { + | bottom: "res2c_branch2a" + | top: "res2c_branch2a" + | name: "scale2c_branch2a" + | type: "Scale" + | param { decay_mult: 0 } + | param { decay_mult: 0 } + | scale_param { + | bias_term: true + | } + |} + | + |layer { + | bottom: "res2c_branch2a" + | top: "res2c_branch2a" + | name: "res2c_branch2a_relu" + | type: "ReLU" + | relu_param { + | + | } + |} + | + |layer { + | bottom: "res2c_branch2a" + | top: "res2c_branch2b" + | name: "res2c_branch2b" + | type: "Convolution" + | convolution_param { + | num_output: 64 + | kernel_size: 3 + | pad: 1 + | stride: 1 + | bias_term: true # change to true + | weight_filler { + | type: "msra" + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |layer { + | bottom: "res2c_branch2b" + | top: "res2c_branch2b" + | name: "scale2c_branch2b" + | type: "Scale" + | param { decay_mult: 0 } + | param { decay_mult: 0 } + | scale_param { + | bias_term: true + | } + |} + | + |layer { + | bottom: "res2c_branch2b" + | top: "res2c_branch2b" + | name: "res2c_branch2b_relu" + | type: "ReLU" + | relu_param { + | + | } + |} + | + |layer { + | bottom: "res2c_branch2b" + | top: "res2c_branch2c" + | name: "res2c_branch2c" + | type: "Convolution" + | convolution_param { + | + | num_output: 256 + | kernel_size: 1 + | pad: 0 + | stride: 1 + | bias_term: true # change to true + | weight_filler { + | type: "msra" + | } + | bias_filler { + | type: "constant" + | value: 0 + | } + | } + |} + | + |layer { + | bottom: "res2c_branch2c" + | top: "res2c_branch2c" + | name: "scale2c_branch2c" + | type: "Scale" + | param { decay_mult: 0 } + | param { decay_mult: 0 } + | scale_param { + | bias_term: true + | } + |} + | + |layer { + | bottom: "res2b" + | bottom: "res2c_branch2c" + | top: "res2c" + | name: "res2c" + | type: "Eltwise" + | eltwise_param { + | + | } + |} + | + |layer { + | bottom: "res2c" + | top: "res2c_" # do not do inplace + | name: "res2c_relu" + | type: "ReLU" + | relu_param { + | + | } + |} + """.stripMargin + val inputShape = Array(4, 3, 224, 224) + val outputShape = Array(4, 256, 56, 56) + + val model = ResNet50.getModel(inputShape, outputShape) + model.compile(TrainingPhase, Array(HeapData(inputShape, Memory.Format.nchw))) + + Tools.compare(prototxt, model, inputShape, outputShape, 1e-5) + } + + object ResNet50 { + var iChannels = 64 + + def shortcut(nInputPlane: 
Int, nOutputPlane: Int, stride: Int, name: String): Module[Float] = { + val useConv = nInputPlane != nOutputPlane + + if (useConv) { + Sequential() + .add(SpatialConvolution(nInputPlane, nOutputPlane, 1, 1, stride, stride) + .setName(s"res${name}_branch1")) + } else if (nInputPlane != nOutputPlane) { + throw new IllegalArgumentException(s"useConv false") + } else { + Identity() + } + } + + def bottleneck(n: Int, stride: Int, name: String = ""): Module[Float] = { + val nInputPlane = iChannels + iChannels = n * 4 + + val s = Sequential() + s.add(SpatialConvolution(nInputPlane, n, 1, 1, 1, 1, 0, 0).setName(s"res${name}_branch2a")) + .add(ReLU().setName(s"res${name}_branch2a_relu")) + .add(SpatialConvolution(n, n, 3, 3, stride, stride, 1, 1).setName(s"res${name}_branch2b")) + .add(ReLU().setName(s"res${name}_branch2b_relu")) + .add(SpatialConvolution(n, n*4, 1, 1, 1, 1, 0, 0).setName(s"res${name}_branch2c")) + + val model = Sequential() + .add(ConcatTable(). + add(s). + add(shortcut(nInputPlane, n*4, stride, name)).setName(s"$name/concatTable")) + .add(CAddTable().setName(s"res$name")) + .add(ReLU().setName(s"res${name}_relu")) + model + } + + def layer(block: (Int, Int, String) => Module[Float], features: Int, + count: Int, stride: Int = 1, name : String): Module[Float] = { + val s = Sequential() + for (i <- 1 to count) { + s.add(block(features, if (i == 1) stride else 1, getName(i, name))) + } + s + } + + def getName(i: Int, name: String): String = { + i match { + case 1 => name + "a" + case 2 => name + "b" + case 3 => name + "c" + case 4 => name + "d" + case 5 => name + "e" + case 6 => name + "f" + } + } + + def getModel(inputShape: Array[Int], outputShape: Array[Int]): MklDnnContainer = { + iChannels = 64 + + Sequential() + .add(SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3).setName("conv1").setReLU(true)) + .add(ReLU().setName("conv1_relu")) + .add(MaxPooling(3, 3, 2, 2).setName("pool1")) + .add(layer(bottleneck, 64, 3, name = "2")) + .add(ReorderMemory(HeapData(outputShape, Memory.Format.nchw))) + } + } + + private def shape2Dim(shape: Array[Int]): String = { + shape.map(x => "dim: " + x).mkString(" ") + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/tensor/DenseTensorMathSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/tensor/DenseTensorMathSpec.scala index 824628222fb..7fe820e3a81 100644 --- a/spark/dl/src/test/scala/com/intel/analytics/bigdl/tensor/DenseTensorMathSpec.scala +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/tensor/DenseTensorMathSpec.scala @@ -22,29 +22,29 @@ import org.scalatest.{FlatSpec, Matchers} @com.intel.analytics.bigdl.tags.Parallel class DenseTensorMathSpec extends FlatSpec with Matchers { "a.dist(b, 1)" should "be correct" in { - val a: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) - val b: Tensor[Double] = new DenseTensor(Storage(Array(2.0, 3.0, 4.0))) + val a: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) + val b: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(2.0, 3.0, 4.0))) a.dist(b, 1) should equal(3) } "a.dist(b, 2)" should "be correct" in { - val a: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) - val b: Tensor[Double] = new DenseTensor(Storage(Array(3.0, 4.0, 5.0))) + val a: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) + val b: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(3.0, 4.0, 5.0))) a.dist(b, 2) should equal(math.sqrt(12)) } "a.dist(b, 3)" should "be correct" in { - val a: Tensor[Double] = new 
DenseTensor(Storage(Array(1.0, 2.0, 3.0))) - val b: Tensor[Double] = new DenseTensor(Storage(Array(3.0, 4.0, 5.0))) + val a: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) + val b: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(3.0, 4.0, 5.0))) a.dist(b, 3) should equal(math.pow(24, 1.0 / 3)) } "vector + scalar" should "be correct" in { val s = 2.0 - val v: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) + val v: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) val r = v + s r(Array(1)) should be(3.0) r(Array(2)) should be(4.0) @@ -52,8 +52,8 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { } "vector + vector" should "be correct" in { - val v1: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) - val v2: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) + val v1: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) + val v2: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) val r = v1 + v2 r(Array(1)) should be(2.0) r(Array(2)) should be(4.0) @@ -63,7 +63,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { "vector + vector which is not contiguous" should "be correct" in { val v1: Tensor[Double] = new DenseTensor[Double](2, 4).fill(1) v1.t() - val v2: Tensor[Double] = new DenseTensor(Storage( + val v2: Tensor[Double] = new DenseTensor(new ArrayStorage( Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0))) val r = v1 + v2 r(Array(1, 1)) should be(2.0) @@ -78,7 +78,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { "vector - scalar" should "be correct" in { val s = 2.0 - val v: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) + val v: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) val r = v - s r(Array(1)) should be(-1.0) r(Array(2)) should be(0.0) @@ -86,8 +86,8 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { } "vector - vector" should "be correct" in { - val v1: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) - val v2: Tensor[Double] = new DenseTensor(Storage(Array(2.0, 0.0, -1.0))) + val v1: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) + val v2: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(2.0, 0.0, -1.0))) val r = v1 - v2 r(Array(1)) should be(-1.0) r(Array(2)) should be(2.0) @@ -96,7 +96,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { "vector * scalar" should "be correct" in { val s = 2.0 - val v: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) + val v: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) val r = v * s r(Array(1)) should be(2.0) r(Array(2)) should be(4.0) @@ -104,8 +104,8 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { } "vector * vector" should "be correct" in { - val v1: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) - val v2: Tensor[Double] = new DenseTensor(Storage(Array(2.0, 0.0, -1.0))) + val v1: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) + val v2: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(2.0, 0.0, -1.0))) val r = v1 * v2 r(Array(1)) should be(-1.0) } @@ -119,7 +119,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { mat(Array(2, 2)) = 6 mat(Array(2, 3)) = 1 - val vec: Tensor[Double] = new DenseTensor(Storage(Array(3.0, 1, 1))) + val vec: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(3.0, 1, 1))) val r = mat * vec r(Array(1)) should 
be(13.0) r(Array(2)) should be(22.0) @@ -136,7 +136,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { val mat1 = mat.t - val vec: Tensor[Double] = new DenseTensor(Storage(Array(3.0, 1, 1))) + val vec: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(3.0, 1, 1))) val r = mat1 * vec r(Array(1)) should be(15.0) r(Array(2)) should be(18.0) @@ -153,7 +153,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { val matrix = tensor(T(T(), T(), 1)).t() - val vec: Tensor[Double] = new DenseTensor(Storage(Array(3.0, 1, 1))) + val vec: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(3.0, 1, 1))) val r = matrix * vec r(Array(1)) should be(15.0) r(Array(2)) should be(18.0) @@ -260,7 +260,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { "vector / scalar" should "be correct" in { val s = 2.0 - val v: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) + val v: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) val r = v / s r(Array(1)) should be(0.5) r(Array(2)) should be(1.0) @@ -268,8 +268,8 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { } "vector / vector" should "be correct" in { - val v1: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) - val v2: Tensor[Double] = new DenseTensor(Storage(Array(2.0, 1.0, -1.0))) + val v1: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) + val v2: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(2.0, 1.0, -1.0))) val r = v1 / v2 r(Array(1)) should be(0.5) r(Array(2)) should be(2.0) @@ -277,7 +277,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { } "-vector" should "be correct" in { - val v: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) + val v: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) val r = -v r(Array(1)) should be(-1.0) r(Array(2)) should be(-2.0) @@ -345,7 +345,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { 1, 2, 3, 4, 1, 2, 3, 4 ) - val a = new DenseTensor[Double](Storage(a_data), 1, Array(3, 4)) + val a = new DenseTensor[Double](new ArrayStorage(a_data), 1, Array(3, 4)) val b_data = Array( @@ -354,7 +354,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { 1, 2, 1, 2 ) - val b = new DenseTensor[Double](Storage(b_data), 1, Array(4, 2)) + val b = new DenseTensor[Double](new ArrayStorage(b_data), 1, Array(4, 2)) val c = Tensor[Double]() c.resize(Array(3, 2)) @@ -366,7 +366,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { 10, 20 ) - val expect_c = new DenseTensor[Double](Storage(expect_c_data), 1, Array(3, 2)) + val expect_c = new DenseTensor[Double](new ArrayStorage(expect_c_data), 1, Array(3, 2)) c.map(expect_c, (a, b) => { a should be(b +- 1e-6) a @@ -379,7 +379,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { 1, 2, 3, 4, 1, 2, 3, 4 ) - val a = new DenseTensor[Double](Storage(a_data), 1, Array(3, 4)) + val a = new DenseTensor[Double](new ArrayStorage(a_data), 1, Array(3, 4)) val b_data = Array( @@ -388,14 +388,14 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { 1, 2, 1, 2 ) - val b = new DenseTensor[Double](Storage(b_data), 1, Array(4, 2)) + val b = new DenseTensor[Double](new ArrayStorage(b_data), 1, Array(4, 2)) val m_data = Array( 1.0, 2, 1, 2, 1, 2 ) - val m = new DenseTensor[Double](Storage(m_data), 1, Array(3, 2)) + val m = new DenseTensor[Double](new ArrayStorage(m_data), 1, Array(3, 2)) val c = Tensor[Double]() c.addmm(m, a, b) @@ -406,7 +406,7 @@ class DenseTensorMathSpec 
extends FlatSpec with Matchers { 11, 22 ) - val expect_c = new DenseTensor[Double](Storage(expect_c_data), 1, Array(3, 2)) + val expect_c = new DenseTensor[Double](new ArrayStorage(expect_c_data), 1, Array(3, 2)) c.map(expect_c, (a, b) => { a should be(b +- 1e-6) a @@ -414,8 +414,8 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { } "addr transpose" should "return correct value" in { - val v1: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) - val v2: Tensor[Double] = new DenseTensor(Storage(Array(2.0, 0.0, -1.0))) + val v1: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) + val v2: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(2.0, 0.0, -1.0))) val tensor: Tensor[Double] = new DenseTensor(3, 3) tensor(Array(1, 1)) = 1 tensor(Array(1, 2)) = 2 @@ -430,26 +430,26 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { val r = Tensor[Double]() r.resize(Array(3, 3)) r.addr(1.0, mat, 1.0, v1, v2) - val expect_r = new DenseTensor(Storage(Array(3.0, 3.0, 4.0, + val expect_r = new DenseTensor(new ArrayStorage(Array(3.0, 3.0, 4.0, 6.0, 4.0, 4.0, 8.0, 4.0, 3.0)), 1, Array(3, 3)) r should be (expect_r) } "addr" should "return correct value" in { - val v1: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) - val v2: Tensor[Double] = new DenseTensor(Storage(Array(2.0, 0.0, -1.0))) + val v1: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) + val v2: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(2.0, 0.0, -1.0))) val r = Tensor[Double]() r.resize(Array(3, 3)) r.addr(v1, v2) - r should be (new DenseTensor[Double](Storage(Array(2.0, 0.0, -1.0, + r should be (new DenseTensor[Double](new ArrayStorage(Array(2.0, 0.0, -1.0, 4.0, 0.0, -2.0, 6.0, 0.0, -3.0)), 1, Array(3, 3))) } "addr noncontiguous" should "return correct value" in { - val v1: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) - val v2: Tensor[Double] = new DenseTensor(Storage(Array(2.0, 0.0, -1.0))) + val v1: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) + val v2: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(2.0, 0.0, -1.0))) val tensor: Tensor[Double] = new DenseTensor(3, 3, 2) tensor(Array(1, 1, 1)) = 1 tensor(Array(1, 2, 1)) = 2 @@ -465,7 +465,7 @@ class DenseTensorMathSpec extends FlatSpec with Matchers { val r = Tensor[Double]() r.resize(Array(3, 3)) r.addr(1, mat, 1, v1, v2) - r should be (new DenseTensor[Double](Storage(Array(3.0, 3.0, 4.0, + r should be (new DenseTensor[Double](new ArrayStorage(Array(3.0, 3.0, 4.0, 6.0, 4.0, 4.0, 8.0, 4.0, 3.0)), 1, Array(3, 3))) } diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/tensor/DenseTensorSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/tensor/DenseTensorSpec.scala index a6df82b6709..38183b21025 100644 --- a/spark/dl/src/test/scala/com/intel/analytics/bigdl/tensor/DenseTensorSpec.scala +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/tensor/DenseTensorSpec.scala @@ -52,7 +52,7 @@ class DenseTensorSpec extends FlatSpec with Matchers { "Construct with storage" should "return 1D vector" in { val storage = Array(1.0, 2.0, 3.0) - val t: Tensor[Double] = new DenseTensor(Storage(storage)) + val t: Tensor[Double] = new DenseTensor(new ArrayStorage(storage)) t.nDimension should be(1) t.size().length should be(1) t.size(1) should be(3) @@ -146,7 +146,7 @@ class DenseTensorSpec extends FlatSpec with Matchers { } "One index on a 1d-dimension tensor" should "return value" in { - val t: Tensor[Double] = new 
DenseTensor(Storage(Array(3.0, 4, 5))) + val t: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(3.0, 4, 5))) t.valueAt(2) should be(4.0) } @@ -185,7 +185,7 @@ class DenseTensorSpec extends FlatSpec with Matchers { "One index update a multi-dimension tensor with tensor" should "copy the tensor to the subset" in { val t: Tensor[Double] = new DenseTensor[Double](3, 2).fill(1) - val src: Tensor[Double] = new DenseTensor(Storage(Array(8.0, 9))) + val src: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(8.0, 9))) t(2) = src t(Array(1, 1)) should be(1) t(Array(1, 2)) should be(1) @@ -196,7 +196,7 @@ class DenseTensorSpec extends FlatSpec with Matchers { } "One index update a 1d-dimension tensor" should "update the value" in { - val t: Tensor[Double] = new DenseTensor(Storage(Array(3.0, 4, 5))) + val t: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(3.0, 4, 5))) t(2) = 6 t.valueAt(1) should be(3.0) t.valueAt(2) should be(6.0) @@ -238,7 +238,7 @@ class DenseTensorSpec extends FlatSpec with Matchers { t(Array(3, 1)) should be(7) t(Array(3, 2)) should be(6) - val src: Tensor[Double] = new DenseTensor(Storage(Array(9.0, 10))) + val src: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(9.0, 10))) t(T(T(2, 3), 1)) = src t(Array(1, 1)) should be(1) @@ -290,7 +290,7 @@ class DenseTensorSpec extends FlatSpec with Matchers { } "clone" should "get a seperated tensor" in { - val t: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2, 3))) + val t: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2, 3))) val t1 = t.clone() t.isSameSizeAs(t1) should be(true) t1.isContiguous() should be(true) @@ -468,18 +468,18 @@ class DenseTensorSpec extends FlatSpec with Matchers { } "equals" should "be correct" in { - val t: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2, 3))) - val t1: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2, 3))) - val t2: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2, 4))) + val t: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2, 3))) + val t1: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2, 3))) + val t2: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2, 4))) t == t1 should be(true) t == t2 should be(false) } "hashCode" should "be correct" in { - val t: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2, 3))) - val t1: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2, 3))) - val t2: Tensor[Double] = new DenseTensor(Storage(Array(1.0, 2, 4))) + val t: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2, 3))) + val t1: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2, 3))) + val t2: Tensor[Double] = new DenseTensor(new ArrayStorage(Array(1.0, 2, 4))) t.hashCode() == t1.hashCode() should be(true) t.hashCode() == t2.hashCode() should be(false) @@ -494,7 +494,7 @@ class DenseTensorSpec extends FlatSpec with Matchers { t = Tensor.scalar[Double](1) t.toString should be("Scalar(1.0)") - t = new DenseTensor(Storage(Array(1.0, 2.0, 3.0))) + t = new DenseTensor(new ArrayStorage(Array(1.0, 2.0, 3.0))) val OneD_STRING = "1.0\n" + "2.0\n" + @@ -692,7 +692,8 @@ class DenseTensorSpec extends FlatSpec with Matchers { } "Tensor to BreezeMatrix" should "correct" in { - val tensor = new DenseTensor[Double](Storage[Double](Array(1.0, 2, 3, 4)), 1, Array(2, 2)) + val tensor = new DenseTensor[Double]( + new ArrayStorage[Double](Array(1.0, 2, 3, 4)), 1, Array(2, 2)) val matrix = tensor.toBreezeMatrix() matrix.isTranspose should be(true) matrix(0, 0) should be(1.0) @@ -727,7 
+728,7 @@ class DenseTensorSpec extends FlatSpec with Matchers { } "Tensor to BreezeVector" should "correct" in { - val tensor = new DenseTensor[Double](Storage(Array(1.0, 2, 3, 4))) + val tensor = new DenseTensor[Double](new ArrayStorage(Array(1.0, 2, 3, 4))) val vector = tensor.toBreezeVector() vector(0) should be(1.0) vector(1) should be(2.0) @@ -745,7 +746,7 @@ class DenseTensorSpec extends FlatSpec with Matchers { } "Tensor to MLMatrix" should "correct" in { - val tensor = new DenseTensor(Storage(Array(1.0, 2, 3, 4)), 1, Array(2, 2)) + val tensor = new DenseTensor(new ArrayStorage(Array(1.0, 2, 3, 4)), 1, Array(2, 2)) val matrix = tensor.toMLlibMatrix() matrix.isTransposed should be(true) matrix(0, 0) should be(1.0) @@ -780,7 +781,7 @@ class DenseTensorSpec extends FlatSpec with Matchers { } "Tensor to MLVector" should "correct" in { - val tensor = new DenseTensor(Storage(Array(1.0, 2, 3, 4))) + val tensor = new DenseTensor(new ArrayStorage(Array(1.0, 2, 3, 4))) val vector = tensor.toMLlibVector() vector(0) should be(1.0) vector(1) should be(2.0) diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/tensor/DnnTensorSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/tensor/DnnTensorSpec.scala new file mode 100644 index 00000000000..c265b46da32 --- /dev/null +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/tensor/DnnTensorSpec.scala @@ -0,0 +1,72 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.intel.analytics.bigdl.tensor + +import com.intel.analytics.bigdl.mkl.MklDnn +import com.intel.analytics.bigdl.utils.{BigDLSpecHelper, T} + +class DnnTensorSpec extends BigDLSpecHelper { + "nElement" should "be correct" in { + val tensor = DnnTensor[Float](3, 4, 5) + tensor.nElement() should be(3 * 4 * 5) + } + + "DnnTensor" should "only support float" in { + intercept[IllegalArgumentException] { + val t = DnnTensor[Double](3, 4, 5) + } + } + + "Copy" should "be correct" in { + val heapTensor = Tensor[Float](T(1, 2, 3, 4)) + val dnnTensor1 = DnnTensor[Float](4) + dnnTensor1.copy(heapTensor) + val dnnTensor2 = DnnTensor[Float](4) + dnnTensor2.copy(dnnTensor1) + val heapTensor2 = Tensor[Float](4) + heapTensor2.copy(dnnTensor2) + heapTensor2 should be(heapTensor) + } + + "release" should "be correct" in { + val tensor = DnnTensor[Float](3, 4, 5) + tensor.isReleased() should be(false) + tensor.release() + tensor.isReleased() should be(true) + } + + "resize" should "be correct" in { + val tensor = DnnTensor[Float](3, 4) + tensor.size() should be(Array(3, 4)) + tensor.resize(Array(2, 3)) + tensor.size() should be(Array(2, 3)) + tensor.resize(2) + tensor.size(1) should be(2) + tensor.resize(Array(5, 6, 7)) + tensor.size() should be(Array(5, 6, 7)) + tensor.size(2) should be(6) + } + + "add" should "be correct" in { + val heapTensor1 = Tensor[Float](T(1, 2, 3, 4)) + val heapTensor2 = Tensor[Float](T(2, 5, 1, 7)) + val dnnTensor1 = DnnTensor[Float](4).copy(heapTensor1) + val dnnTensor2 = DnnTensor[Float](4).copy(heapTensor2) + dnnTensor1.add(dnnTensor2) + val heapTensor3 = Tensor[Float](4).copy(dnnTensor1) + heapTensor3 should be(Tensor[Float](T(3, 7, 4, 11))) + } +} diff --git a/spark/dl/src/test/scala/com/intel/analytics/bigdl/utils/serializer/SerializerSpec.scala b/spark/dl/src/test/scala/com/intel/analytics/bigdl/utils/serializer/SerializerSpec.scala index 563538e9a3d..1bf9f4aa3bd 100644 --- a/spark/dl/src/test/scala/com/intel/analytics/bigdl/utils/serializer/SerializerSpec.scala +++ b/spark/dl/src/test/scala/com/intel/analytics/bigdl/utils/serializer/SerializerSpec.scala @@ -44,7 +44,24 @@ class SerializerSpec extends BigDLSpecHelper { "com.intel.analytics.bigdl.utils.serializer.TestModule", "com.intel.analytics.bigdl.utils.ExceptionTest", "com.intel.analytics.bigdl.utils.serializer.SubModuleOne", - "com.intel.analytics.bigdl.utils.serializer.SubModuleTwo" + "com.intel.analytics.bigdl.utils.serializer.SubModuleTwo", + "com.intel.analytics.bigdl.nn.mkldnn.AvgPooling", + "com.intel.analytics.bigdl.nn.mkldnn.CAddTable", + "com.intel.analytics.bigdl.nn.mkldnn.ConcatTable", + "com.intel.analytics.bigdl.nn.mkldnn.DnnBase", + "com.intel.analytics.bigdl.nn.mkldnn.Identity", + "com.intel.analytics.bigdl.nn.mkldnn.Input", + "com.intel.analytics.bigdl.nn.mkldnn.JoinTable", + "com.intel.analytics.bigdl.nn.mkldnn.Linear", + "com.intel.analytics.bigdl.nn.mkldnn.LRN", + "com.intel.analytics.bigdl.nn.mkldnn.MaxPooling", + "com.intel.analytics.bigdl.nn.mkldnn.ReLU", + "com.intel.analytics.bigdl.nn.mkldnn.ReorderMemory", + "com.intel.analytics.bigdl.nn.mkldnn.SelectTable", + "com.intel.analytics.bigdl.nn.mkldnn.Sequential", + "com.intel.analytics.bigdl.nn.mkldnn.SoftMax", + "com.intel.analytics.bigdl.nn.mkldnn.SpatialBatchNormalization", + "com.intel.analytics.bigdl.nn.mkldnn.SpatialConvolution" ) // Maybe one serial test class contains multiple module test