From 904eefaf8a82ea10c0a804c58a11110fa296a74a Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 3 Jan 2017 15:51:07 +0800
Subject: [PATCH 01/11] add TensorShape, used to represent tensors of any
 dimension.

---
 paddle/function/TensorType.h       | 125 +++++++++++++++++++++++++++++
 paddle/function/TensorTypeTest.cpp |  53 ++++++++++++
 2 files changed, 178 insertions(+)
 create mode 100644 paddle/function/TensorType.h
 create mode 100644 paddle/function/TensorTypeTest.cpp
diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h
new file mode 100644
index 0000000000000..0b860f204606c
--- /dev/null
+++ b/paddle/function/TensorType.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+namespace paddle {
+
+enum ValueType {
+  VALUE_TYPE_INT32 = 0,
+  VALUE_TYPE_FLOAT = 1,
+  VALUE_TYPE_DOUBLE = 2,
+  VALUE_TYPE_BYTE = 3
+};
+
+enum DeviceType {
+  DEVICE_TYPE_UNSPECIFIED = 0,
+  DEVICE_TYPE_CPU = 1,
+  DEVICE_TYPE_GPU = 2
+};
+
+inline int sizeOfValuType(ValueType valueType) {
+  if (valueType == VALUE_TYPE_INT32) {
+    return 4;
+  } else if (valueType == VALUE_TYPE_FLOAT) {
+    return 4;
+  } else if (valueType == VALUE_TYPE_DOUBLE) {
+    return 8;
+  }
+  if (valueType == VALUE_TYPE_BYTE) return 1;  // declared in ValueType: 1 byte
+  LOG(FATAL) << "Unknown type: " << valueType;
+  return 0;  // keeps every path returning a value
+}
+
+template <typename T>
+struct DataType;
+
+template <>
+struct DataType<float> {
+  static const ValueType value = VALUE_TYPE_FLOAT;
+};
+
+template <>
+struct DataType<double> {
+  static const ValueType value = VALUE_TYPE_DOUBLE;
+};
+
+/**
+ * TensorShape used to represent shape of normal tensor.
+ */
+class TensorShape {
+public:
+  TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
+
+  TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); };
+
+  TensorShape(std::initializer_list<size_t> dims) {
+    ndims_ = dims.size();
+    initDims(ndims_);
+    std::copy(dims.begin(), dims.end(), dims_.begin());
+    numElements();
+  };
+
+  TensorShape(const TensorShape& t)
+      : ndims_(t.ndims_), nelements_(t.nelements_) {
+    initDims(ndims_);
+    std::copy(t.dims_.begin(), t.dims_.end(), dims_.begin());
+  };
+
+  // get the size of specified dimension
+  size_t operator[](size_t dim) const {
+    CHECK_GE(dim, 0);
+    CHECK_LT(dim, ndims_);
+    return dims_[dim];
+  }
+
+  // set the size of specified dimension
+  void setDim(size_t dim, size_t size) {
+    CHECK_GE(dim, 0);
+    CHECK_LT(dim, ndims_);
+    dims_[dim] = size;
+    numElements();
+  }
+
+  // number of dimensions of the tensor
+  size_t ndims() const { return ndims_; }
+
+  size_t getElements() const { return nelements_; }
+
+private:
+  // compute number of elements
+  void numElements() {
+    nelements_ = 1;
+    for (size_t n = 0; n < ndims_; n++) {
+      nelements_ *= dims_[n];
+    }
+  }
+
+  // init dims_
+  void initDims(size_t ndims) {
+    size_t count = ndims < 4 ? 4 : ndims;
+    dims_.assign(count, 1);
+  }
+
+  // number of dimensions
+  // ndims_ may be not equeal dims_.size()
+  size_t ndims_;
+  // number of elements
+  size_t nelements_;
+  std::vector<size_t> dims_;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp
new file mode 100644
index 0000000000000..99c25f42a1e6c
--- /dev/null
+++ b/paddle/function/TensorTypeTest.cpp
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TensorType.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(TensorShape, Constructor) {
+  TensorShape t1;
+  EXPECT_EQ(t1.ndims(), 0);
+  EXPECT_EQ(t1.getElements(), 0);
+
+  TensorShape t2(3);
+  EXPECT_EQ(t2.ndims(), 3);
+  EXPECT_EQ(t2.getElements(), 1);
+
+  TensorShape t3({8, 10});
+  EXPECT_EQ(t3.ndims(), 2);
+  EXPECT_EQ(t3.getElements(), 80);
+
+  TensorShape t4(t3);
+  EXPECT_EQ(t4.ndims(), t3.ndims());
+  EXPECT_EQ(t4.getElements(), t3.getElements());
+
+  TensorShape t5({1, 2, 3, 4, 5});
+  EXPECT_EQ(t5.ndims(), 5);
+  EXPECT_EQ(t5.getElements(), 120);
+}
+
+TEST(TensorShape, GetAndSet) {
+  TensorShape t({1, 2, 3});
+  EXPECT_EQ(t.ndims(), 3);
+  EXPECT_EQ(t.getElements(), 6);
+
+  EXPECT_EQ(t[1], 2);
+  t.setDim(1, 100);
+  EXPECT_EQ(t.getElements(), 300);
+  EXPECT_EQ(t[1], 100);
+}
+
+}  // namespace paddle

From 0c4be7e6a687b5ec9a722fc1c9dbded70b1aa8ea Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 4 Jan 2017 16:51:49 +0800
Subject: [PATCH 02/11] move TensorShape to TensorShape.h, add Tensor<> to TensorType.h

---
 paddle/function/TensorShape.h       |  97 +++++++++++++++++++++++++
 paddle/function/TensorShapeTest.cpp |  53 ++++++++++++++
 paddle/function/TensorType.h        | 107 +++++++++++++---------------
 paddle/function/TensorTypeTest.cpp  |  52 ++++++--------
 4 files changed, 222 insertions(+), 87 deletions(-)
 create mode 100644 paddle/function/TensorShape.h
 create mode 100644 paddle/function/TensorShapeTest.cpp

diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h
new file mode 100644
index 0000000000000..e70484a1afd99
--- /dev/null
+++ b/paddle/function/TensorShape.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+namespace paddle {
+
+/**
+ * TensorShape used to represent shape of normal tensor.
+ */
+class TensorShape {
+public:
+  TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
+
+  TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); };
+
+  TensorShape(std::initializer_list<size_t> dims) {
+    ndims_ = dims.size();
+    initDims(ndims_);
+    std::copy(dims.begin(), dims.end(), dims_.begin());
+    numElements();
+  };
+
+  TensorShape(const TensorShape& t)
+      : ndims_(t.ndims_), nelements_(t.nelements_) {
+    initDims(ndims_);
+    std::copy(t.dims_.begin(), t.dims_.end(), dims_.begin());
+  };
+
+  // get the size of specified dimension
+  size_t operator[](size_t dim) const {
+    CHECK_GE(dim, 0);
+    CHECK_LT(dim, ndims_);
+    return dims_[dim];
+  }
+
+  // set the size of specified dimension
+  void setDim(size_t dim, size_t size) {
+    CHECK_GE(dim, 0);
+    CHECK_LT(dim, ndims_);
+    dims_[dim] = size;
+    numElements();
+  }
+
+  // number of dimensions of the tensor
+  size_t ndims() const { return ndims_; }
+
+  size_t getElements() const { return nelements_; }
+
+  bool operator==(const TensorShape& t) const {
+    if (ndims() != t.ndims()) return false;
+    for (size_t i = 0; i < ndims(); i++) {
+      if (dims_[i] != t.dims_[i]) return false;
+    }
+
+    return true;
+  }
+
+  bool operator!=(const TensorShape& t) const { return !(*this == t); }
+
+private:
+  // compute number of elements
+  void numElements() {
+    nelements_ = 1;
+    for (size_t n = 0; n < ndims_; n++) {
+      nelements_ *= dims_[n];
+    }
+  }
+
+  // init dims_
+  void initDims(size_t ndims) {
+    size_t count = ndims < 4 ? 4 : ndims;
+    dims_.assign(count, 1);
+  }
+
+  // number of dimensions
+  // ndims_ may differ from dims_.size(): initDims() pads dims_ to >= 4 entries
+  size_t ndims_;
+  // number of elements
+  size_t nelements_;
+  std::vector<size_t> dims_;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/function/TensorShapeTest.cpp
new file mode 100644
index 0000000000000..45a2e106e7fc3
--- /dev/null
+++ b/paddle/function/TensorShapeTest.cpp
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TensorShape.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(TensorShape, Constructor) {
+  TensorShape t1;
+  EXPECT_EQ(t1.ndims(), 0);
+  EXPECT_EQ(t1.getElements(), 0);
+
+  TensorShape t2(3);
+  EXPECT_EQ(t2.ndims(), 3);
+  EXPECT_EQ(t2.getElements(), 1);
+
+  TensorShape t3({8, 10});
+  EXPECT_EQ(t3.ndims(), 2);
+  EXPECT_EQ(t3.getElements(), 80);
+
+  TensorShape t4(t3);
+  EXPECT_EQ(t4.ndims(), t3.ndims());
+  EXPECT_EQ(t4.getElements(), t3.getElements());
+
+  TensorShape t5({1, 2, 3, 4, 5});
+  EXPECT_EQ(t5.ndims(), 5);
+  EXPECT_EQ(t5.getElements(), 120);
+}
+
+TEST(TensorShape, GetAndSet) {
+  TensorShape t({1, 2, 3});
+  EXPECT_EQ(t.ndims(), 3);
+  EXPECT_EQ(t.getElements(), 6);
+
+  EXPECT_EQ(t[1], 2);
+  t.setDim(1, 100);
+  EXPECT_EQ(t.getElements(), 300);
+  EXPECT_EQ(t[1], 100);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h
index 0b860f204606c..800f71a5b974c 100644
--- a/paddle/function/TensorType.h
+++ b/paddle/function/TensorType.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include <glog/logging.h>
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
@@ -57,69 +57,60 @@ struct DataType<double> {
   static const ValueType value = VALUE_TYPE_DOUBLE;
 };
 
-/**
- * TensorShape used to represent shape of normal tensor.
- */
-class TensorShape {
-public:
-  TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
-
-  TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); };
-
-  TensorShape(std::initializer_list<size_t> dims) {
-    ndims_ = dims.size();
-    initDims(ndims_);
-    std::copy(dims.begin(), dims.end(), dims_.begin());
-    numElements();
-  };
-
-  TensorShape(const TensorShape& t)
-      : ndims_(t.ndims_), nelements_(t.nelements_) {
-    initDims(ndims_);
-    std::copy(t.dims_.begin(), t.dims_.end(), dims_.begin());
-  };
-
-  // get the size of specified dimension
-  size_t operator[](size_t dim) const {
-    CHECK_GE(dim, 0);
-    CHECK_LT(dim, ndims_);
-    return dims_[dim];
-  }
+namespace detail {
 
-  // set the size of specified dimension
-  void setDim(size_t dim, size_t size) {
-    CHECK_GE(dim, 0);
-    CHECK_LT(dim, ndims_);
-    dims_[dim] = size;
-    numElements();
-  }
+template <typename VType, DeviceType Device>
+struct MatrixT;
 
-  // number of dimensions of the tensor
-  size_t ndims() const { return ndims_; }
+template <>
+struct MatrixT<real, DEVICE_TYPE_CPU> {
+  using type = CpuMatrix;
+};
 
-  size_t getElements() const { return nelements_; }
+template <>
+struct MatrixT<real, DEVICE_TYPE_GPU> {
+  using type = GpuMatrix;
+};
 
-private:
-  // compute number of elements
-  void numElements() {
-    nelements_ = 1;
-    for (size_t n = 0; n < ndims_; n++) {
-      nelements_ *= dims_[n];
-    }
-  }
+template <>
+struct MatrixT<int, DEVICE_TYPE_CPU> {
+  using type = void;  // Not implemented
+};
 
-  // init dims_
-  void initDims(size_t ndims) {
-    size_t count = ndims < 4 ? 4 : ndims;
-    dims_.assign(count, 1);
-  }
+template <>
+struct MatrixT<int, DEVICE_TYPE_GPU> {
+  using type = void;  // Not implemented
+};
+
+template <typename VType, DeviceType Device>
+struct VectorT;
+
+template <>
+struct VectorT<real, DEVICE_TYPE_CPU> {
+  using type = CpuVector;
+};
+
+template <>
+struct VectorT<real, DEVICE_TYPE_GPU> {
+  using type = GpuVector;
+};
+
+template <>
+struct VectorT<int, DEVICE_TYPE_CPU> {
+  using type = CpuIVector;
+};
+
+template <>
+struct VectorT<int, DEVICE_TYPE_GPU> {
+  using type = GpuIVector;
+};
+
+}  // namespace detail
 
-  // number of dimensions
-  // ndims_ may be not equeal dims_.size()
-  size_t ndims_;
-  // number of elements
-  size_t nelements_;
-  std::vector<size_t> dims_;
+template <typename VType, DeviceType DType>
+struct Tensor {
+  typedef typename detail::MatrixT<VType, DType>::type Matrix;
+  typedef typename detail::VectorT<VType, DType>::type Vector;
 };
 
 }  // namespace paddle
diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp
index 99c25f42a1e6c..4a86245c2a288 100644
--- a/paddle/function/TensorTypeTest.cpp
+++ b/paddle/function/TensorTypeTest.cpp
@@ -17,37 +17,31 @@ limitations under the License. */
 
 namespace paddle {
 
-TEST(TensorShape, Constructor) {
-  TensorShape t1;
-  EXPECT_EQ(t1.ndims(), 0);
-  EXPECT_EQ(t1.getElements(), 0);
-
-  TensorShape t2(3);
-  EXPECT_EQ(t2.ndims(), 3);
-  EXPECT_EQ(t2.getElements(), 1);
-
-  TensorShape t3({8, 10});
-  EXPECT_EQ(t3.ndims(), 2);
-  EXPECT_EQ(t3.getElements(), 80);
-
-  TensorShape t4(t3);
-  EXPECT_EQ(t4.ndims(), t3.ndims());
-  EXPECT_EQ(t4.getElements(), t3.getElements());
-
-  TensorShape t5({1, 2, 3, 4, 5});
-  EXPECT_EQ(t5.ndims(), 5);
-  EXPECT_EQ(t5.getElements(), 120);
+TEST(TensorType, Matrix) {
+  Tensor<real, DEVICE_TYPE_CPU>::Matrix matrix(100, 200);
+  EXPECT_EQ(matrix.getHeight(), 100);
+  EXPECT_EQ(matrix.getWidth(), 200);
+  EXPECT_EQ(matrix.getElementCnt(), 100 * 200);
+  EXPECT_EQ(matrix.useGpu(), false);
+
+  Tensor<real, DEVICE_TYPE_GPU>::Matrix testGpu(100, 200);
+  EXPECT_EQ(testGpu.useGpu(), true);
 }
 
-TEST(TensorShape, GetAndSet) {
-  TensorShape t({1, 2, 3});
-  EXPECT_EQ(t.ndims(), 3);
-  EXPECT_EQ(t.getElements(), 6);
-
-  EXPECT_EQ(t[1], 2);
-  t.setDim(1, 100);
-  EXPECT_EQ(t.getElements(), 300);
-  EXPECT_EQ(t[1], 100);
+TEST(TensorType, Vector) {
+  Tensor<real, DEVICE_TYPE_CPU>::Vector cpuVector(100);
+  Tensor<real, DEVICE_TYPE_GPU>::Vector gpuVector(100);
+  EXPECT_EQ(cpuVector.useGpu(), false);
+  EXPECT_EQ(gpuVector.useGpu(), true);
+  EXPECT_EQ(cpuVector.getSize(), 100);
+  EXPECT_EQ(gpuVector.getSize(), 100);
+
+  Tensor<int, DEVICE_TYPE_CPU>::Vector cpuIVector(100);
+  Tensor<int, DEVICE_TYPE_GPU>::Vector gpuIVector(100);
+  EXPECT_EQ(cpuIVector.useGpu(), false);
+  EXPECT_EQ(gpuIVector.useGpu(), true);
+  EXPECT_EQ(cpuIVector.getSize(), 100);
+  EXPECT_EQ(gpuIVector.getSize(), 100);
 }
 
 }  // namespace paddle

From c5c8051657611025eeaf8bc095da09a81fb76a1d Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 4 Jan 2017 21:17:56 +0800
Subject: [PATCH 03/11] add BufferArg

---
 paddle/function/BufferArg.cpp     |  43 +++++
 paddle/function/BufferArg.h       | 260 ++++++++++++++++++++++++++++++
 paddle/function/BufferArgTest.cpp | 128 +++++++++++++++
 paddle/function/TensorType.h      |   5 +
 4 files changed, 436 insertions(+)
 create mode 100644 paddle/function/BufferArg.cpp
 create mode 100644 paddle/function/BufferArg.h
 create mode 100644 paddle/function/BufferArgTest.cpp

diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
new file mode 100644
index 0000000000000..08031917b21e1
--- /dev/null
+++ b/paddle/function/BufferArg.cpp
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+
+#include "BufferArg.h"
+
+namespace paddle {
+
+const SequenceArg& BufferArg::sequence() const {
+  // CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
+  return dynamic_cast<const SequenceArg&>(*this);
+}
+
+const SparseMatrixArg& BufferArg::sparse() const {
+  // CHECK_EQ(bufferType_, TENSOR_SPARSE);
+  return dynamic_cast<const SparseMatrixArg&>(*this);
+}
+
+void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape) {
+  args_.push_back(std::make_shared<BufferArg>(arg, shape));
+}
+
+void BufferArgs::addArg(const CpuSparseMatrix& arg) {
+  args_.push_back(std::make_shared<SparseMatrixArg>(arg));
+}
+
+void BufferArgs::addArg(const GpuSparseMatrix& arg) {
+  args_.push_back(std::make_shared<SparseMatrixArg>(arg));
+}
+
+}  // namespace paddle
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
new file mode 100644
index 0000000000000..9fcda7a878aad
--- /dev/null
+++ b/paddle/function/BufferArg.h
@@ -0,0 +1,260 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+#include "TensorShape.h"
+#include "TensorType.h"
+#include "paddle/math/CpuSparseMatrix.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+
+enum BufferType {
+  TENSOR_NORMAL = 0,
+  TENSOR_SEQUENCE_ID = 1,
+  TENSOR_SEQUENCE_DATA = 2,
+  TENSOR_SPARSE = 3
+};
+
+enum SparseDataType {
+  SPARSE_NO_VALUE = 0,  // do not need value pointer, all values are 1
+  SPARSE_FLOAT_VALUE = 1
+};
+
+enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
+
+/**
+ * BufferArg used as the argument type for Function.
+ */
+class BufferArg;
+class SequenceArg;
+class SparseMatrixArg;
+typedef std::shared_ptr<BufferArg> BufferArgPtr;
+
+class BufferArgs {
+public:
+  BufferArgs() {}
+  size_t size() const { return args_.size(); }
+
+  // add argument into BufferArgs
+  template <typename Tensor>
+  void addArg(const Tensor& arg) {
+    args_.push_back(std::make_shared<BufferArg>(arg));
+  }
+
+  void addArg(const Matrix& arg, const TensorShape& shape);
+
+  void addArg(const CpuSparseMatrix& arg);
+  void addArg(const GpuSparseMatrix& arg);
+
+  // get argument
+  const BufferArg& operator[](size_t num) const {
+    CHECK_LT(num, args_.size());
+    return *args_[num];
+  }
+
+private:
+  std::vector<BufferArgPtr> args_;
+};
+
+// an array of arbitrary dimensions
+class BufferArg {
+public:
+  BufferArg(void* buf, ValueType valueType, const TensorShape& shape)
+      : buf_(buf), valueType_(valueType), shape_(shape) {}
+
+  BufferArg(void* buf, ValueType valueType)
+      : buf_(buf), valueType_(valueType) {}
+
+  BufferArg(const Matrix& matrix)
+      : buf_((void*)matrix.getData()),
+        valueType_(DataType<real>::value),
+        shape_(2) {
+    shape_.setDim(0, matrix.getHeight());
+    shape_.setDim(1, matrix.getWidth());
+  }
+
+  BufferArg(const Matrix& matrix, const TensorShape& shape)
+      : buf_((void*)matrix.getData()),
+        valueType_(DataType<real>::value),
+        shape_(shape) {
+    CHECK_EQ(matrix.getElementCnt(), shape.getElements());
+  }
+
+  BufferArg(const Vector& vector)
+      : buf_((void*)vector.getData()),
+        valueType_(DataType<real>::value),
+        shape_(1) {
+    shape_.setDim(0, vector.getSize());
+  }
+
+  BufferArg(const IVector& vector)
+      : buf_((void*)vector.getData()), valueType_(VALUE_TYPE_INT32), shape_(1) {
+    shape_.setDim(0, vector.getSize());
+  }
+
+  template <DeviceType DType>
+  typename Tensor<real, DType>::Matrix matrix() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<real>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ(2, shape_.ndims());
+    return typename Tensor<real, DType>::Matrix(
+        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
+  }
+
+  template <typename VType, DeviceType DType>
+  typename Tensor<VType, DType>::Vector vector() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<VType>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ(1, shape_.ndims());
+    return typename Tensor<VType, DType>::Vector(
+        shape_[0], reinterpret_cast<VType*>(buf_));
+  }
+
+  virtual ~BufferArg() {}
+
+  template <typename T>
+  T* data() const {
+    return reinterpret_cast<T*>(buf_);
+  }
+
+  void* data() const { return buf_; }
+  ValueType valueType() const { return valueType_; }
+  BufferType bufferType() const { return bufferType_; }
+  const TensorShape& shape() const { return shape_; }
+
+  const SequenceArg& sequence() const;
+  const SparseMatrixArg& sparse() const;
+
+protected:
+  void* buf_;
+  ValueType valueType_;
+  TensorShape shape_;
+  BufferType bufferType_;
+  // leading dimensions. The size is dims_.size()
+  // Dims lds_;
+};
+
+// sequence start positions in a mini-batch of sequences
+// shape_.ndims() == 1
+// valueType_ = int32
+// if a < b then value_.buf_[a] < value_.buf_[b]
+class SequenceIdArg : public BufferArg {
+public:
+  SequenceIdArg(void* buf, const TensorShape& shape)
+      : BufferArg(buf, VALUE_TYPE_INT32, shape) {
+    CHECK_EQ(shape_.ndims(), 1);
+    numSeqs_ = shape_[0] - 1;
+  }
+
+  SequenceIdArg(const IVector& vector) : BufferArg(vector) {
+    numSeqs_ = shape_[0] - 1;
+  }
+
+  ~SequenceIdArg() {}
+
+  size_t numSeqs() const { return numSeqs_; }
+
+private:
+  size_t numSeqs_;
+};
+
+// sequence data
+class SequenceArg : public BufferArg {
+public:
+  SequenceArg(void* buf,
+              ValueType valueType,
+              const TensorShape& shape,
+              const SequenceIdArg& startPositions)
+      : BufferArg(buf, valueType, shape), startPositions_(startPositions) {}
+
+  SequenceArg(const Matrix& matrix, const IVector& vector)
+      : BufferArg(matrix), startPositions_(vector) {}
+
+  ~SequenceArg() {}
+
+  void* getIdBuf() const { return startPositions_.data(); }
+  size_t numSeqs() const { return startPositions_.numSeqs(); }
+
+private:
+  SequenceIdArg startPositions_;
+};
+
+// sparse matrix
+// valueType_ == float or double
+// shape_.ndims() == 2
+class SparseMatrixArg : public BufferArg {
+public:
+  SparseMatrixArg(void* buf,
+                  ValueType valueType,
+                  const TensorShape& shape,
+                  const BufferArg& row,
+                  const BufferArg& col,
+                  size_t nnz,
+                  SparseDataFormat format,
+                  SparseDataType type)
+      : BufferArg(buf, valueType, shape),
+        row_(row),
+        col_(col),
+        nnz_(nnz),
+        format_(format),
+        type_(type) {
+    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
+    CHECK_EQ(shape_.ndims(), 2);
+    CHECK_EQ(row_.shape().ndims(), 1);
+    CHECK_EQ(col_.shape().ndims(), 1);
+    if (format == SPARSE_CSR_FORMAT) {
+      CHECK_EQ(nnz, col.shape()[0]);
+    } else if (format == SPARSE_CSC_FORMAT) {
+      CHECK_EQ(nnz, row.shape()[0]);
+    }
+  }
+
+  SparseMatrixArg(const CpuSparseMatrix& sparse)
+      : BufferArg(sparse),
+        row_((void*)sparse.getRows(), VALUE_TYPE_INT32),
+        col_((void*)sparse.getCols(), VALUE_TYPE_INT32) {}
+
+  SparseMatrixArg(const GpuSparseMatrix& sparse)
+      : BufferArg(sparse),
+        row_((void*)sparse.getRows(), VALUE_TYPE_INT32),
+        col_((void*)sparse.getCols(), VALUE_TYPE_INT32) {}
+
+  ~SparseMatrixArg() {}
+
+  void* getRowBuf() const { return row_.data(); }
+
+  void* getColBuf() const { return col_.data(); }
+
+  size_t nnz() const { return nnz_; }
+
+  SparseDataFormat dataFormat() const { return format_; }
+
+  SparseDataType dataType() const { return type_; }
+
+private:
+  BufferArg row_;
+  BufferArg col_;
+  size_t nnz_;
+  SparseDataFormat format_;
+  SparseDataType type_;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
new file mode 100644
index 0000000000000..5d669b8137e1a
--- /dev/null
+++ b/paddle/function/BufferArgTest.cpp
@@ -0,0 +1,128 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "BufferArg.h"
+#include <gtest/gtest.h>
+#include "paddle/math/MemoryHandle.h"
+
+namespace paddle {
+
+TEST(BufferTest, BufferArg) {
+  TensorShape shape({8, 10});
+  CpuMemoryHandle memory(shape.getElements() *
+                         sizeOfValuType(VALUE_TYPE_FLOAT));
+  BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape);
+  EXPECT_EQ(buffer.data(), memory.getBuf());
+}
+
+TEST(BufferTest, SequenceIdArg) {
+  TensorShape shape({10});
+  CpuMemoryHandle memory(shape.getElements() *
+                         sizeOfValuType(VALUE_TYPE_INT32));
+  SequenceIdArg buffer(memory.getBuf(), shape);
+  EXPECT_EQ(buffer.data(), memory.getBuf());
+  EXPECT_EQ(buffer.numSeqs(), 9);
+}
+
+TEST(BufferTest, asArgument) {
+  MatrixPtr matrix = Matrix::create(100, 200);
+  VectorPtr vector = Vector::create(100, false);
+  CpuSparseMatrix sparse(200, 300, 50);
+
+  // prepare arguments
+  BufferArgs argments;
+  argments.addArg(*matrix);
+  argments.addArg(*vector);
+  argments.addArg(sparse);
+
+  // function
+  auto function = [=](const BufferArgs& inputs) {
+    EXPECT_EQ(inputs.size(), 3);
+
+    // check inputs[0]
+    EXPECT_EQ(inputs[0].shape().ndims(), 2);
+    EXPECT_EQ(inputs[0].shape()[0], 100);
+    EXPECT_EQ(inputs[0].shape()[1], 200);
+    EXPECT_EQ(inputs[0].data(), matrix->getData());
+
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getHeight(),
+              matrix->getHeight());
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getWidth(),
+              matrix->getWidth());
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+
+    // check inputs[1]
+    EXPECT_EQ(inputs[1].shape().ndims(), 1);
+    EXPECT_EQ(inputs[1].shape()[0], 100);
+    EXPECT_EQ(inputs[1].data(), vector->getData());
+    CpuVector inVector = inputs[1].vector<real, DEVICE_TYPE_CPU>();
+    EXPECT_EQ(inVector.getSize(), vector->getSize());
+    EXPECT_EQ(inVector.getData(), vector->getData());
+
+    // check inputs[2]
+    EXPECT_EQ(inputs[2].shape().ndims(), 2);
+    EXPECT_EQ(inputs[2].shape()[0], 200);
+    EXPECT_EQ(inputs[2].shape()[1], 300);
+    EXPECT_EQ(inputs[2].data(), sparse.getData());
+    // CHECK_EQ(inputs[2].sparse().nnz(), 50);
+    // CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
+  };
+
+  // call function
+  function(argments);
+}
+
+template <DeviceType DType>
+void FunctionApi(typename Tensor<real, DType>::Matrix& output,
+                 const typename Tensor<real, DType>::Matrix& input);
+
+template <>
+void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 100);
+  EXPECT_EQ(output.getWidth(), 200);
+}
+
+template <>
+void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 10);
+  EXPECT_EQ(output.getWidth(), 20);
+}
+
+template <DeviceType DType>
+void Function(const BufferArgs& arguments) {
+  auto input = arguments[0].matrix<DType>();
+  auto output = arguments[1].matrix<DType>();
+  FunctionApi<DType>(output, input);
+}
+
+TEST(BufferTest, Function) {
+  CpuMatrix cpuInput = CpuMatrix(100, 200);
+  CpuMatrix cpuOutput = CpuMatrix(100, 200);
+  BufferArgs cpuArgments;
+  cpuArgments.addArg(cpuInput);
+  cpuArgments.addArg(cpuOutput);
+  Function<DEVICE_TYPE_CPU>(cpuArgments);
+
+  GpuMatrix gpuInput = GpuMatrix(10, 20);
+  GpuMatrix gpuOutput = GpuMatrix(10, 20);
+  BufferArgs gpuArgments;
+  gpuArgments.addArg(gpuInput);
+  gpuArgments.addArg(gpuOutput);
+  Function<DEVICE_TYPE_GPU>(gpuArgments);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h
index 800f71a5b974c..98942cff9e2ea 100644
--- a/paddle/function/TensorType.h
+++ b/paddle/function/TensorType.h
@@ -57,6 +57,11 @@ struct DataType<double> {
   static const ValueType value = VALUE_TYPE_DOUBLE;
 };
 
+template <>
+struct DataType<int> {
+  static const ValueType value = VALUE_TYPE_INT32;
+};
+
 namespace detail {
 
 template <typename VType, DeviceType Device>

From 68156c88c50aff2c614ecc69b56bd5f814dc30be Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 5 Jan 2017 19:45:12 +0800
Subject: [PATCH 04/11] Modify the argument type of Function

---
 paddle/function/CrossMapNormalOp.cpp          | 68 +++++++++----------
 paddle/function/Function.h                    | 53 ++-------------
 paddle/gserver/layers/NormProjectionLayer.cpp | 30 +++++---
 paddle/gserver/layers/NormProjectionLayer.h   |  2 +-
 4 files changed, 56 insertions(+), 97 deletions(-)

diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index f13eb78d27d90..ec27db9c21296 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -125,27 +125,25 @@ class CrossMapNormalFunc : public FunctionBase {
     pow_ = config.get<real>("pow");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
+  void calc(const BufferArgs& inputs,
+            const BufferArgs& outputs,
+            const BufferArgs& inouts) override {
     CHECK_EQ(1, inputs.size());
     CHECK_EQ(2, outputs.size());
     CHECK_EQ(0, inouts.size());
 
-    CHECK_EQ(inputs[0].dims_.size(), 4);
-    for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
-      CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], outputs[1].dims_[i]);
-    }
+    CHECK_EQ(inputs[0].shape().ndims(), 4);
+    CHECK(inputs[0].shape() == outputs[0].shape());
+    CHECK(inputs[0].shape() == outputs[1].shape());
 
-    size_t samples = inputs[0].dims_[0];
-    size_t channels = inputs[0].dims_[1];
-    size_t height = inputs[0].dims_[2];
-    size_t width = inputs[0].dims_[3];
+    size_t samples = inputs[0].shape()[0];
+    size_t channels = inputs[0].shape()[1];
+    size_t height = inputs[0].shape()[2];
+    size_t width = inputs[0].shape()[3];
 
-    CrossMapNormal<Device>(outputs[0].getData(),
-                           outputs[1].getData(),
-                           inputs[0].getData(),
+    CrossMapNormal<Device>(outputs[0].data<real>(),
+                           outputs[1].data<real>(),
+                           inputs[0].data<real>(),
                            samples,
                            channels,
                            height,
@@ -177,31 +175,29 @@ class CrossMapNormalGradFunc : public FunctionBase {
     pow_ = config.get<real>("pow");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
+  void calc(const BufferArgs& inputs,
+            const BufferArgs& outputs,
+            const BufferArgs& inouts) override {
     CHECK_EQ(4, inputs.size());
     CHECK_EQ(1, outputs.size());
     CHECK_EQ(0, inouts.size());
 
-    CHECK_EQ(inputs[0].dims_.size(), 4);
-    for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
-      CHECK_EQ(inputs[0].dims_[i], inputs[1].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], inputs[2].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], inputs[3].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
-    }
-
-    size_t samples = inputs[0].dims_[0];
-    size_t channels = inputs[0].dims_[1];
-    size_t height = inputs[0].dims_[2];
-    size_t width = inputs[0].dims_[3];
-
-    CrossMapNormalGrad<Device>(outputs[0].getData(),
-                               inputs[0].getData(),
-                               inputs[1].getData(),
-                               inputs[2].getData(),
-                               inputs[3].getData(),
+    CHECK_EQ(inputs[0].shape().ndims(), 4);
+    CHECK(inputs[0].shape() == inputs[1].shape());
+    CHECK(inputs[0].shape() == inputs[2].shape());
+    CHECK(inputs[0].shape() == inputs[3].shape());
+    CHECK(inputs[0].shape() == outputs[0].shape());
+
+    size_t samples = inputs[0].shape()[0];
+    size_t channels = inputs[0].shape()[1];
+    size_t height = inputs[0].shape()[2];
+    size_t width = inputs[0].shape()[3];
+
+    CrossMapNormalGrad<Device>(outputs[0].data<real>(),
+                               inputs[0].data<real>(),
+                               inputs[1].data<real>(),
+                               inputs[2].data<real>(),
+                               inputs[3].data<real>(),
                                samples,
                                channels,
                                height,
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 9e8cbb8e48c30..024575b4f7bcd 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -16,57 +16,12 @@ limitations under the License. */
 
 #include <map>
 #include <vector>
+#include "BufferArg.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/ClassRegistrar.h"
 
 namespace paddle {
 
-enum DeviceType {
-  DEVICE_TYPE_UNSPECIFIED = 0,
-  DEVICE_TYPE_CPU = 1,
-  DEVICE_TYPE_GPU = 2,
-};
-
-template <DeviceType Device>
-struct MatrixT;
-
-template <>
-struct MatrixT<DEVICE_TYPE_CPU> {
-  using type = CpuMatrix;
-};
-
-template <>
-struct MatrixT<DEVICE_TYPE_GPU> {
-  using type = GpuMatrix;
-};
-
-template <DeviceType Device>
-struct SequenceT;
-
-template <>
-struct SequenceT<DEVICE_TYPE_CPU> {
-  using type = CpuIVector;
-};
-
-template <>
-struct SequenceT<DEVICE_TYPE_GPU> {
-  using type = GpuIVector;
-};
-
-typedef std::vector<size_t> Dims;
-
-class Tensor {
-public:
-  Tensor(real* data, const Dims& dim) : buf_(data), dims_(dim) {}
-
-  real* getData() const { return buf_; }
-
-  real* buf_;
-  Dims dims_;
-};
-
-typedef std::vector<Tensor> Arguments;
-
 class FuncConfig {
 public:
   union value {
@@ -92,9 +47,9 @@ class FunctionBase {
 
   virtual void init(const FuncConfig& config) {}
 
-  virtual void calc(const Arguments& inputs,
-                    const Arguments& outputs,
-                    const Arguments& inouts) {}
+  virtual void calc(const BufferArgs& inputs,
+                    const BufferArgs& outputs,
+                    const BufferArgs& inouts) {}
 
   static ClassRegistrar<FunctionBase> funcRegistrar_;
 };
diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp
index 262d757c67e10..573de152fd0d5 100644
--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
@@ -71,11 +71,16 @@ void CMRProjectionNormLayer::forward(PassType passType) {
 
   Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);
 
-  dims_ = {batchSize, channels_, imgSizeH_, imgSizeW_};
-  forward_[0]->calc(
-      {Tensor(input->getData(), dims_)},
-      {Tensor(outV->getData(), dims_), Tensor(denoms_->getData(), dims_)},
-      {});
+  shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  BufferArgs inouts;
+  inputs.addArg(*input, shape_);
+  outputs.addArg(*outV, shape_);
+  outputs.addArg(*denoms_, shape_);
+
+  forward_[0]->calc(inputs, outputs, inouts);
 }
 
 void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
@@ -90,11 +95,14 @@ void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
   MatrixPtr localOutV = getOutputValue();
   MatrixPtr preOutV = inputLayers_[0]->getOutputValue();
 
-  backward_[0]->calc({Tensor(preOutV->getData(), dims_),
-                      Tensor(localOutV->getData(), dims_),
-                      Tensor(localGrad->getData(), dims_),
-                      Tensor(denoms_->getData(), dims_)},
-                     {Tensor(preOutGrad->getData(), dims_)},
-                     {});
+  BufferArgs inputs;
+  BufferArgs outputs;
+  BufferArgs inouts;
+  inputs.addArg(*preOutV, shape_);
+  inputs.addArg(*localOutV, shape_);
+  inputs.addArg(*localGrad, shape_);
+  inputs.addArg(*denoms_, shape_);
+  outputs.addArg(*preOutGrad, shape_);
+  backward_[0]->calc(inputs, outputs, inouts);
 }
 }  // namespace paddle
diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h
index 6b2c5dde0d74d..2c0d8a3a718c4 100644
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ b/paddle/gserver/layers/NormProjectionLayer.h
@@ -41,6 +41,6 @@ class CMRProjectionNormLayer : public ResponseNormLayer {
   void backward(const UpdateCallback& callback = nullptr);
 
 protected:
-  Dims dims_;
+  TensorShape shape_;
 };
 }  // namespace paddle

From 41c52d3b0ce619ba25ff9d681ef39613daa1c868 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 5 Jan 2017 20:33:09 +0800
Subject: [PATCH 05/11] Modify the argument type of ContextProjectionFunc

---
 paddle/function/CMakeLists.txt              |   12 +-
 paddle/function/ContextProjectionOp.cpp     |  161 +-
 paddle/function/ContextProjectionOp.h       |   54 +-
 paddle/function/ContextProjectionOpGpu.cu   |   44 +-
 paddle/function/TensorTypeTest.cpp          |   17 +
 paddle/gserver/layers/ContextProjection.cpp |   42 +-
 paddle/math/Matrix.h                        |    4 +
 paddle/math/Matrix.h~RFbb8b484f.TMP         | 1870 +++++++++++++++++++
 8 files changed, 2048 insertions(+), 156 deletions(-)
 create mode 100644 paddle/math/Matrix.h~RFbb8b484f.TMP

diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 0b3126155d0c0..37c011549eca9 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -3,6 +3,7 @@ file(GLOB cpp_files . *Op.cpp)
 
 list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
+list(APPEND cpp_files BufferArg.cpp)
 
 if(WITH_GPU)
     file(GLOB cu_files . *OpGpu.cu)
@@ -16,10 +17,13 @@ if(WITH_TESTING)
     # TODO:
     # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    add_simple_unittest(CrossMapNormalOpTest)
-    add_unittest(ContextProjectionOpTest
-        ContextProjectionOpTest.cpp
-        ../gserver/tests/TestUtil.cpp)
+    # add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(TensorShapeTest)
+    add_simple_unittest(TensorTypeTest)
+    add_simple_unittest(BufferArgTest)
+    # add_unittest(ContextProjectionOpTest
+    #    ContextProjectionOpTest.cpp
+    #    ../gserver/tests/TestUtil.cpp)
 endif()
 endif()
 
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index bd367a859e10c..1a483c47953b1 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -19,17 +19,15 @@ limitations under the License. */
 namespace paddle {
 
 template <>
-void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
-                                               const CpuMatrix* input_mat,
-                                               const CpuMatrix* weight_mat,
+void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
+                                               const CpuMatrix& input_mat,
+                                               const CpuMatrix& weight_mat,
                                                const CpuIVector& seq_vec,
                                                size_t context_length,
                                                int context_start,
                                                size_t begin_pad) {
   const int* starts = seq_vec.getData();
   const size_t num_sequences = seq_vec.getSize() - 1;
-  auto w_mat = const_cast<CpuMatrix*>(weight_mat);
-  auto in_mat = const_cast<CpuMatrix*>(input_mat);
   for (size_t i = 0; i < num_sequences; ++i) {
     for (size_t j = 0; j < context_length; ++j) {
       int begin = starts[i] + context_start + j;
@@ -39,10 +37,11 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
       if (begin < starts[i]) {
         int64_t pad_size =
             std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size);
-        if (w_mat) {
-          MatrixPtr sub = w_mat->subMatrix(j, pad_size);
-          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size);
+        if (weight_mat) {
+          MatrixPtr sub =
+              const_cast<CpuMatrix&>(weight_mat).subMatrix(j, pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
         }
         dst_begin = starts[i] + pad_size;
         begin = starts[i];
@@ -50,19 +49,22 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
       if (end > starts[i + 1]) {
         int64_t pad_size =
             std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
-        if (w_mat) {
-          MatrixPtr sub = w_mat->subMatrix(
-              begin_pad + context_start + j - pad_size, pad_size);
-          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+        if (weight_mat) {
+          MatrixPtr sub =
+              const_cast<CpuMatrix&>(weight_mat)
+                  .subMatrix(begin_pad + context_start + j - pad_size,
+                             pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
         }
         dst_end = starts[i + 1] - pad_size;
         end = starts[i + 1];
       }
       if (end <= begin) continue;
-      MatrixPtr src = in_mat->subMatrix(begin, end - begin);
-      MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin);
-      dst->addAtOffset(*src, j * in_mat->getWidth());
+      MatrixPtr src =
+          const_cast<CpuMatrix&>(input_mat).subMatrix(begin, end - begin);
+      MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin);
+      dst->addAtOffset(*src, j * input_mat.getWidth());
     }
   }
 }
@@ -82,40 +84,34 @@ class ContextProjectionForwardFunc : public FunctionBase {
     begin_pad_ = config.get<size_t>("begin_pad");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
+  void calc(const BufferArgs& inputs,
+            const BufferArgs& outputs,
+            const BufferArgs& inouts) override {
     CHECK_EQ(3, inputs.size());
     CHECK_EQ(1, outputs.size());
     CHECK_EQ(0, inouts.size());
 
-    CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData());
-    CHECK_EQ(outputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[1].dims_.size(), 2);
-    CHECK_EQ(inputs[2].dims_.size(), 1);
+    CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data());
+    CHECK_EQ(outputs[0].shape().ndims(), 2);
+    CHECK_EQ(inputs[0].shape().ndims(), 2);
+    CHECK_EQ(inputs[1].shape().ndims(), 2);
+    CHECK_EQ(inputs[2].shape().ndims(), 1);
     /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
     /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
     /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
-
-    auto out_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    const auto in_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    const auto w_mat =
-        !inputs[1].getData()
-            ? nullptr
-            : std::make_shared<typename MatrixT<Device>::type>(
-                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
-
-    ContextProjectionForward<Device>(out_mat.get(),
-                                     in_mat.get(),
-                                     w_mat.get(),
+    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+
+    auto out_mat = outputs[0].matrix<Device>();
+    auto in_mat = inputs[0].matrix<Device>();
+    auto w_mat = !inputs[1].data()
+                     ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                     : inputs[1].matrix<Device>();
+    auto seq_vec = inputs[2].vector<int, Device>();
+    ContextProjectionForward<Device>(out_mat,
+                                     in_mat,
+                                     w_mat,
                                      seq_vec,
                                      context_length_,
                                      context_start_,
@@ -129,18 +125,17 @@ class ContextProjectionForwardFunc : public FunctionBase {
 };
 
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
-                                                CpuMatrix* in_grad_mat,
-                                                CpuMatrix* w_grad_mat,
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
+                                                CpuMatrix& in_grad_mat,
+                                                CpuMatrix& w_grad_mat,
                                                 const CpuIVector& seq_vec,
                                                 size_t context_length,
                                                 int context_start,
                                                 size_t begin_pad,
                                                 bool is_padding,
                                                 size_t total_pad) {
-  CHECK(out_grad_mat);
-  size_t input_dim = in_grad_mat ? in_grad_mat->getWidth()
-                                 : w_grad_mat ? w_grad_mat->getWidth() : 0;
+  size_t input_dim = in_grad_mat ? in_grad_mat.getWidth()
+                                 : w_grad_mat ? w_grad_mat.getWidth() : 0;
   const int* starts = seq_vec.getData();
   size_t num_sequences = seq_vec.getSize() - 1;
   for (size_t i = 0; i < num_sequences; ++i) {
@@ -153,8 +148,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
         int64_t pad_size =
             std::min(starts[i] - begin, starts[i + 1] - starts[i]);
         if (is_padding && w_grad_mat) {
-          MatrixPtr mat = out_grad_mat->subMatrix(starts[i], pad_size);
-          MatrixPtr sub = w_grad_mat->subMatrix(j, pad_size);
+          MatrixPtr mat = out_grad_mat.subMatrix(starts[i], pad_size);
+          MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
           sub->addAtOffset(*mat, j * input_dim);
         }
         dst_begin = starts[i] + pad_size;
@@ -165,8 +160,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
             std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
         if (is_padding && w_grad_mat) {
           MatrixPtr mat =
-              out_grad_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
-          MatrixPtr sub = w_grad_mat->subMatrix(
+              out_grad_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+          MatrixPtr sub = w_grad_mat.subMatrix(
               begin_pad + context_start + j - pad_size, pad_size);
           sub->addAtOffset(*mat, j * input_dim);
         }
@@ -175,8 +170,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
       }
       if (end <= begin) continue;
       if (!in_grad_mat) continue;
-      MatrixPtr src = in_grad_mat->subMatrix(begin, end - begin);
-      MatrixPtr dst = out_grad_mat->subMatrix(dst_begin, dst_end - dst_begin);
+      MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
+      MatrixPtr dst = out_grad_mat.subMatrix(dst_begin, dst_end - dst_begin);
       src->addAtOffset(*dst, j * input_dim);
     }
   }
@@ -199,44 +194,37 @@ class ContextProjectionBackwardFunc : public FunctionBase {
     total_pad_ = config.get<size_t>("total_pad");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
+  void calc(const BufferArgs& inputs,
+            const BufferArgs& outputs,
+            const BufferArgs& inouts) override {
     CHECK_EQ(3, inputs.size());
     CHECK_EQ(1, outputs.size());
     CHECK_EQ(0, inouts.size());
 
-    CHECK(outputs[0].getData() && inputs[2].getData());
-    CHECK_EQ(outputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[1].dims_.size(), 2);
-    CHECK_EQ(inputs[2].dims_.size(), 1);
+    CHECK(outputs[0].data() && inputs[2].data());
+    CHECK_EQ(outputs[0].shape().ndims(), 2);
+    CHECK_EQ(inputs[0].shape().ndims(), 2);
+    CHECK_EQ(inputs[1].shape().ndims(), 2);
+    CHECK_EQ(inputs[2].shape().ndims(), 1);
 
     /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
     /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
     /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
 
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    auto out_grad_mat = outputs[0].matrix<Device>();
     auto in_grad_mat =
-        !inputs[0].getData()
-            ? nullptr
-            : std::make_shared<typename MatrixT<Device>::type>(
-                  inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    auto w_grad_mat =
-        !inputs[1].getData()
-            ? nullptr
-            : std::make_shared<typename MatrixT<Device>::type>(
-                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
-
-    ContextProjectionBackward<Device>(out_grad_mat.get(),
-                                      in_grad_mat ? in_grad_mat.get() : nullptr,
-                                      w_grad_mat ? w_grad_mat.get() : nullptr,
+        !inputs[0].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                          : inputs[0].matrix<Device>();
+    auto w_grad_mat = !inputs[1].data()
+                          ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                          : inputs[1].matrix<Device>();
+    auto seq_vec = inputs[2].vector<int, Device>();
+    ContextProjectionBackward<Device>(out_grad_mat,
+                                      in_grad_mat,
+                                      w_grad_mat,
                                       seq_vec,
                                       context_length_,
                                       context_start_,
@@ -253,6 +241,7 @@ class ContextProjectionBackwardFunc : public FunctionBase {
   size_t total_pad_;
 };
 
+#if 0
 /**
  * \param inputs[0] input grad.
  * \param inputs[1] input sequence.
@@ -272,6 +261,7 @@ class ContextProjectionBackwardDataFunc : public FunctionBase {
     CHECK_EQ(2, inputs.size());
     CHECK_EQ(1, outputs.size());
     CHECK_EQ(0, inouts.size());
+
     CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
     CHECK_EQ(outputs[0].dims_.size(), 2);
     CHECK_EQ(inputs[0].dims_.size(), 2);
@@ -349,6 +339,7 @@ class ContextProjectionBackwardWeightFunc : public FunctionBase {
   size_t begin_pad_;
   size_t total_pad_;
 };
+#endif
 
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                     CPU,
@@ -363,6 +354,7 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                     GPU,
                     ContextProjectionBackwardFunc);
+#if 0
 REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
                     GPU,
                     ContextProjectionBackwardDataFunc);
@@ -370,4 +362,5 @@ REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
                     GPU,
                     ContextProjectionBackwardWeightFunc);
 #endif
+#endif
 }  // namespace paddle
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h
index 93eb050fde35f..a558df5e072f2 100644
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
@@ -31,14 +31,15 @@ namespace paddle {
  * \param[in]   is_padding        whether padding 0 or not.
  *
  */
-template <DeviceType Device>
-void ContextProjectionForward(typename MatrixT<Device>::type* output,
-                              const typename MatrixT<Device>::type* input,
-                              const typename MatrixT<Device>::type* weight,
-                              const typename SequenceT<Device>::type& sequence,
-                              size_t context_length,
-                              int context_start,
-                              size_t begin_pad);
+template <DeviceType DType>
+void ContextProjectionForward(
+    typename Tensor<real, DType>::Matrix& output,
+    const typename Tensor<real, DType>::Matrix& input,
+    const typename Tensor<real, DType>::Matrix& weight,
+    const typename Tensor<int, DType>::Vector& sequence,
+    size_t context_length,
+    int context_start,
+    size_t begin_pad);
 
 /**
  * \brief   Context Projection Backward.
@@ -53,30 +54,31 @@ void ContextProjectionForward(typename MatrixT<Device>::type* output,
  * \param[in]   is_padding        whether padding 0 or not.
  *
  */
-template <DeviceType Device>
-void ContextProjectionBackward(typename MatrixT<Device>::type* out_grad,
-                               typename MatrixT<Device>::type* in_grad,
-                               typename MatrixT<Device>::type* w_grad,
-                               const typename SequenceT<Device>::type& seq_vec,
-                               size_t context_length,
-                               int context_start,
-                               size_t begin_pad,
-                               bool is_padding,
-                               size_t total_pad);
+template <DeviceType DType>
+void ContextProjectionBackward(
+    typename Tensor<real, DType>::Matrix& out_grad,
+    typename Tensor<real, DType>::Matrix& in_grad,
+    typename Tensor<real, DType>::Matrix& w_grad,
+    const typename Tensor<int, DType>::Vector& seq_vec,
+    size_t context_length,
+    int context_start,
+    size_t begin_pad,
+    bool is_padding,
+    size_t total_pad);
 
-template <DeviceType Device>
+template <DeviceType DType>
 void ContextProjectionBackwardData(
-    typename MatrixT<Device>::type* out_grad,
-    typename MatrixT<Device>::type* in_grad,
-    const typename SequenceT<Device>::type& sequence,
+    typename Tensor<real, DType>::Matrix& out_grad,
+    typename Tensor<real, DType>::Matrix& in_grad,
+    const typename Tensor<int, DType>::Vector& sequence,
     size_t context_length,
     int context_start);
 
-template <DeviceType Device>
+template <DeviceType DType>
 void ContextProjectionBackwardWeight(
-    typename MatrixT<Device>::type* out_grad,
-    typename MatrixT<Device>::type* w_grad,
-    const typename SequenceT<Device>::type& seq_vec,
+    typename Tensor<real, DType>::Matrix& out_grad,
+    typename Tensor<real, DType>::Matrix& w_grad,
+    const typename Tensor<int, DType>::Vector& seq_vec,
     size_t context_length,
     int context_start,
     size_t total_pad,
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
index 1ec7058f96c82..6a4a01a651041 100644
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -120,20 +120,19 @@ void hl_context_projection_forward(const real* input,
 }
 
 template <>
-void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix* output,
-                                               const GpuMatrix* input,
-                                               const GpuMatrix* weight,
+void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
+                                               const GpuMatrix& input,
+                                               const GpuMatrix& weight,
                                                const GpuIVector& sequence,
                                                size_t context_length,
                                                int context_start,
                                                size_t begin_pad) {
-  CHECK(input && output);
-  hl_context_projection_forward(input->getData(),
+  hl_context_projection_forward(input.getData(),
                                 sequence.getData(),
-                                weight ? weight->getData() : nullptr,
-                                output->getData(),
+                                weight ? weight.getData() : nullptr,
+                                output.getData(),
                                 sequence.getSize() - 1,
-                                input->getWidth(),
+                                input.getWidth(),
                                 context_length,
                                 context_start,
                                 begin_pad);
@@ -217,17 +216,16 @@ void hl_context_projection_backward_data(real* out_grad,
 }
 
 template <>
-void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
-                                                    GpuMatrix* in_grad,
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+                                                    GpuMatrix& in_grad,
                                                     const GpuIVector& sequence,
                                                     size_t context_length,
                                                     int context_start) {
-  CHECK(in_grad && out_grad);
-  hl_context_projection_backward_data(out_grad->getData(),
+  hl_context_projection_backward_data(out_grad.getData(),
                                       sequence.getData(),
-                                      in_grad->getData(),
+                                      in_grad.getData(),
                                       sequence.getSize() - 1,
-                                      in_grad->getWidth(),
+                                      in_grad.getWidth(),
                                       context_length,
                                       context_start);
 }
@@ -348,19 +346,18 @@ void hl_context_projection_backward_weight(real* out_grad,
 
 template <>
 void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-        GpuMatrix* out_grad,
-        GpuMatrix* w_grad,
+        GpuMatrix& out_grad,
+        GpuMatrix& w_grad,
         const GpuIVector& seq_vec,
         size_t context_length,
         int context_start,
         size_t total_pad,
         size_t begin_pad) {
-  CHECK(out_grad && w_grad);
-  hl_context_projection_backward_weight(out_grad->getData(),
+  hl_context_projection_backward_weight(out_grad.getData(),
                                         seq_vec.getData(),
-                                        w_grad->getData(),
+                                        w_grad.getData(),
                                         seq_vec.getSize() - 1,
-                                        w_grad->getWidth(),
+                                        w_grad.getWidth(),
                                         total_pad,
                                         context_length,
                                         context_start,
@@ -368,16 +365,15 @@ void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
 }
 
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
-                                                GpuMatrix* in_grad,
-                                                GpuMatrix* w_grad,
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+                                                GpuMatrix& in_grad,
+                                                GpuMatrix& w_grad,
                                                 const GpuIVector& sequence,
                                                 size_t context_length,
                                                 int context_start,
                                                 size_t begin_pad,
                                                 bool is_padding,
                                                 size_t total_pad) {
-    CHECK(out_grad);
     if (in_grad) {
         ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
                 out_grad,
diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp
index 4a86245c2a288..e50e46f3e9911 100644
--- a/paddle/function/TensorTypeTest.cpp
+++ b/paddle/function/TensorTypeTest.cpp
@@ -44,4 +44,21 @@ TEST(TensorType, Vector) {
   EXPECT_EQ(gpuIVector.getSize(), 100);
 }
 
+TEST(TensorType, EmptyMatrix) {
+  CpuMatrix empty(nullptr, 0, 0);
+  CpuMatrix nonEmpty(10, 10);
+  EXPECT_EQ(empty.isEmpty(), true);
+  EXPECT_EQ(nonEmpty.isEmpty(), false);
+  CHECK(nonEmpty);
+  auto function = [](const CpuMatrix& matrix) {
+    if (matrix) {
+      EXPECT_NE(matrix.getData(), nullptr);
+    } else {
+      EXPECT_EQ(matrix.getData(), nullptr);
+    }
+  };
+  function(empty);
+  function(nonEmpty);
+}
+
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index e947b2b9ecbeb..26783a42cac42 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -110,7 +110,7 @@ void ContextProjection::forward() {
   size_t input_dim = in_->value->getWidth();
   size_t dim = out_->value->getWidth();
   CHECK_EQ(dim, input_dim * config_.context_length());
-  size_t batch_size = in_->value->getHeight();
+  // size_t batch_size = in_->value->getHeight();
   CHECK_EQ(forward_.size(), 1) << "Only one forward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
@@ -119,14 +119,17 @@ void ContextProjection::forward() {
   auto w_ptr =
       state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
   auto start_pos = in_->sequenceStartPositions;
-  forward_[0]->calc({Tensor(in_->value->getData(), Dims{batch_size, input_dim}),
-                     Tensor(w_ptr ? w_ptr->getData() : nullptr,
-                            Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
-                     Tensor(reinterpret_cast<real*>(
-                                const_cast<int*>(start_pos->getData(useGpu_))),
-                            Dims{start_pos->getSize()})},
-                    {Tensor(out_->value->getData(), Dims{batch_size, dim})},
-                    {});
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  BufferArgs inouts;
+  inputs.addArg(*in_->value);
+  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+                          w_ptr ? w_ptr->getHeight() : 0,
+                          input_dim));
+  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
+  outputs.addArg(*out_->value);
+  forward_[0]->calc(inputs, outputs, inouts);
 
   if (state_ && config_.context_start() < 0) {
     CHECK_EQ(1, in_->getNumSequences());
@@ -160,15 +163,18 @@ void ContextProjection::backward(const UpdateCallback& callback) {
   bool is_padding = config_.trainable_padding();
   auto start_pos = in_->sequenceStartPositions;
   auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
-  backward_[0]->calc({Tensor(in_->grad ? in_->grad->getData() : nullptr,
-                             Dims{batch_size, input_dim}),
-                      Tensor(w_ptr ? w_ptr->getData() : nullptr,
-                             Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
-                      Tensor(reinterpret_cast<real*>(
-                                 const_cast<int*>(start_pos->getData(useGpu_))),
-                             Dims{start_pos->getSize()})},
-                     {Tensor(out_->grad->getData(), Dims{batch_size, dim})},
-                     {});
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  BufferArgs inouts;
+  inputs.addArg(CpuMatrix(
+      in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim));
+  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+                          w_ptr ? w_ptr->getHeight() : 0,
+                          input_dim));
+  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
+  outputs.addArg(*out_->grad);
+  backward_[0]->calc(inputs, outputs, inouts);
 
   if (config_.trainable_padding()) {
     weight_->getParameterPtr()->incUpdate(callback);
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 4865a081a5aaa..60c6560396854 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -1091,6 +1091,10 @@ class Matrix : public BaseMatrix {
       TensorCpuApply<real>(*this, expr);
     }
   }
+
+  bool isEmpty() const { return data_ == nullptr; }
+
+  explicit operator bool() const { return !isEmpty(); }
 };
 
 inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
diff --git a/paddle/math/Matrix.h~RFbb8b484f.TMP b/paddle/math/Matrix.h~RFbb8b484f.TMP
new file mode 100644
index 0000000000000..d89b0f67b3c98
--- /dev/null
+++ b/paddle/math/Matrix.h~RFbb8b484f.TMP
@@ -0,0 +1,1870 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <memory>
+#include <thread>
+
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/ThreadLocal.h"
+
+#include <hl_gpu.h>
+
+#include "BaseMatrix.h"
+#include "MemoryHandle.h"
+#include "Vector.h"
+#include "paddle/utils/ThreadLocal.h"
+#include "paddle/utils/common.h"
+
+namespace paddle {
+
+enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
+
+/**
+ * @brief  matrix sparse_format .
+ *
+ * nnz represents nonzero number in sparse matrix.
+ *
+ * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element
+ * represents row start index in Matrix. length of col and value are nnz.
+ *
+ * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element
+ * represents col start index in Matrix. length of row and value are nnz.
+ *
+ * @code
+ * for example: [0, 1, 0, 2, 0;
+ *               1, 0, 0, 0, 0;
+ *               0, 0, 0, 2, 5];
+ * SPARSE_CSR row   [0, 2, 3, 5];
+ *            col   [1, 3, 0, 3, 4];
+ *            value [1, 2, 1, 2, 5]
+ * SPARSE_CSC col   [0, 1, 2, 2, 4, 5];
+ *            row   [1, 0, 0, 2, 2];
+ *            value [1, 1, 2, 2, 5]
+ * @endcode
+ */
+enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 };
+
+class Matrix;
+class GpuMatrix;
+class CpuMatrix;
+class CpuSparseMatrix;
+class GpuSparseMatrix;
+typedef std::shared_ptr<Matrix> MatrixPtr;
+typedef std::shared_ptr<GpuMatrix> GpuMatrixPtr;
+typedef std::shared_ptr<CpuMatrix> CpuMatrixPtr;
+typedef std::shared_ptr<GpuSparseMatrix> GpuSparseMatrixPtr;
+typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr;
+
+/**
+ * Copy or assignment constructor will share the data as opposed to making a
+ * copy of the original data. To make a copy of the original data, use
+ * copyFrom() instead.
+ */
+class Matrix : public BaseMatrix {
+protected:
+  Matrix(MemoryHandlePtr memHandle,
+         size_t height,
+         size_t width,
+         bool trans,
+         bool use_gpu);
+
+  Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu);
+
+  Matrix(real* data,
+         size_t height,
+         size_t width,
+         size_t stride,
+         bool trans,
+         bool use_gpu);
+
+  static ThreadLocal<MatrixPtr> tmpMat_;
+
+public:
+  size_t elementCnt_;  // maximal number of elements which can be held in data_
+  MemoryHandlePtr memoryHandle_;
+
+public:
+  virtual ~Matrix() {}
+
+  static MatrixPtr create(MemoryHandlePtr memHandle,
+                          size_t height,
+                          size_t width,
+                          bool trans = false);
+  static MatrixPtr create(size_t height,
+                          size_t width,
+                          bool trans = false,
+                          bool useGpu = false);
+  static MatrixPtr create(real* data,
+                          size_t height,
+                          size_t width,
+                          bool trans = false,
+                          bool useGpu = false);
+  static MatrixPtr create(real* data,
+                          size_t height,
+                          size_t width,
+                          size_t stride,
+                          bool trans = false,
+                          bool useGpu = false);
+
+  static MatrixPtr createSparseMatrix(size_t height,
+                                      size_t width,
+                                      size_t nnz,
+                                      SparseValueType valueType = FLOAT_VALUE,
+                                      bool trans = false,
+                                      bool useGpu = false);
+  static MatrixPtr createSparseMatrix(size_t height,
+                                      size_t width,
+                                      size_t nnz,
+                                      SparseValueType valueType = FLOAT_VALUE,
+                                      SparseFormat foramt = SPARSE_CSR,
+                                      bool trans = false,
+                                      bool useGpu = false);
+
+  static MatrixPtr createSparseMatrix(real* data,
+                                      int* row,
+                                      int* col,
+                                      size_t height,
+                                      size_t width,
+                                      size_t nnz, /* used to allocate space */
+                                      SparseValueType valueType, /*value type*/
+                                      SparseFormat format,
+                                      bool trans,
+                                      bool useGpu);
+
+  static void resizeOrCreateSparseMatrix(
+      MatrixPtr& matrix,
+      size_t height,
+      size_t width,
+      size_t nnz,
+      SparseValueType valueType = FLOAT_VALUE,
+      SparseFormat foramt = SPARSE_CSR,
+      bool trans = false,
+      bool useGpu = false);
+
+  static void resizeOrCreate(MatrixPtr& a,
+                             size_t height,
+                             size_t width,
+                             bool trans = false,
+                             bool useGpu = false);
+
+  /**
+   * @brief  set the data buffer used to hold the matrix data.
+   *
+   * caller should make sure that the size of data is at least
+   * sizeof(real)*height*width.
+   */
+  void setData(real* data) {
+    BaseMatrix::setData(data);
+    memoryHandle_.reset();
+  }
+
+  /// the data should be contiguous
+  void setData(real* data, size_t newHeight, size_t newWidth) {
+    setData(data);
+    height_ = newHeight;
+    width_ = newWidth;
+    elementCnt_ = newHeight * newWidth;
+    stride_ = width_;
+  }
+
+  size_t getWidth() const { return width_; }
+  size_t getHeight() const { return height_; }
+  size_t getStride() const { return stride_; }
+  size_t getElementCnt() const { return elementCnt_; }
+  virtual real* getData() { return data_; }
+  virtual const real* getData() const { return data_; }
+  bool isTransposed() const { return trans_; }
+  bool isContiguous() const { return stride_ == width_ || height_ == 1; }
+
+  // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix
+  // before calling the following functions.
+  // Declare these functions in the base class just easy to call them.
+  // And these declarations should be moved to base class of sparse matrix
+  // if refactor sparse matrix
+  virtual int* getRows() const {
+    LOG(FATAL) << "Not implemented";
+    return nullptr;  //! suppress warning for no return value.
+  }
+
+  virtual int* getCols() const {
+    LOG(FATAL) << "Not implemented";
+    return nullptr;  //! suppress warning for no return value.
+  }
+
+  virtual SparseFormat getFormat() const {
+    LOG(FATAL) << "Not implemented";
+    return SPARSE_CSR;  //! suppress warning for no return value.
+  }
+
+  virtual SparseValueType getValueType() const {
+    LOG(FATAL) << "Not implemented";
+    return NO_VALUE;  //! suppress warning for no return value.
+  }
+
+  /**
+   * @brief matrix element-wise add
+   *
+   * Named add3 just because add/add2 has been used in BaseMatrix.cu
+   * and they are not virtual function.
+   */
+  virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; }
+
+  MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; }
+
+  virtual void zeroMem() { LOG(FATAL) << "Not implemented"; }
+
+  virtual void resetOne() { LOG(FATAL) << "Not implemented"; }
+
+  void setDiag(real value);
+
+  virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; }
+
+  virtual void trimFrom(const CpuSparseMatrix& src) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  // asynchronous copy
+  virtual void copyFrom(const Matrix& src, hl_stream_t stream) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  MatrixPtr subMatrix(size_t startRow,
+                      size_t endRow,
+                      size_t startCol,
+                      size_t endCol);
+
+  MatrixPtr subRowMatrix(size_t startRow, size_t endRow) {
+    return subMatrix(startRow, endRow, 0, getWidth());
+  }
+
+  MatrixPtr subColMatrix(size_t startCol, size_t endCol) {
+    return subMatrix(0, getHeight(), startCol, endCol);
+  }
+
+  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) {
+    CHECK_LE(startRow + numRows, getHeight());
+    return Matrix::create(getData() + startRow * getWidth(),
+                          numRows,
+                          getWidth(),
+                          trans_,
+                          useGpu_);
+  }
+  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) {
+    CHECK_LE(startRow + numRows, getHeight());
+    CHECK_EQ(useGpu_, dest->useGpu_);
+    dest->setData(this->rowBuf(startRow), numRows, getWidth());
+    return dest;
+  }
+
+  /**
+   * If this is GpuMatrix, src is assumed to be CPU memory
+   *
+   * If this is CpuMatrix, src is assumed to be CPU memory
+   */
+  virtual void copyFrom(const real* src, size_t size) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void copyFrom(const real* src, const int64_t* seq) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @brief convert a int vector to a real matrix.
+   *
+   * (1) source and dest are both in CPU.
+   *
+   * (2) sizes are exactly match.
+   */
+  virtual void copyFrom(const IVector& src) {
+    LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
+  }
+
+  virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix,
+   *        NonValueSparseMatrix, etc.) as this.
+   *
+   * If height and width is zero, the new matrix will have the same size
+   * as this, otherwise the new matrix will have the specified size.
+   *
+   */
+  virtual MatrixPtr clone(size_t height = 0,
+                          size_t width = 0,
+                          bool useGpu = false) {
+    LOG(FATAL) << "Not implemented";
+    return nullptr;
+  }
+
+  virtual real* getRowBuf(size_t row) {
+    LOG(FATAL) << "Not implemented";
+    return nullptr;
+  }
+
+  virtual real getElement(size_t x, size_t y) const {
+    LOG(FATAL) << "Not implemented";
+    return 0;
+  }
+
+  virtual real getSum() {
+    LOG(FATAL) << "Not implemented";
+    return 0;
+  }
+
+  virtual void accumulateColSum(Matrix& src) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual real getAbsSum() {
+    LOG(FATAL) << "Not implemented";
+    return 0;
+  }
+
+  /**
+   * @note Original data may not be preserved after resize().
+   */
+  virtual void resize(size_t newHeight, size_t newWidth) = 0;
+
+  /**
+   * @note This should only be used for sparse matrix.
+   */
+  virtual void resize(size_t newHeight,
+                      size_t newWidth,
+                      size_t newNnz, /* total item used to allocate space */
+                      SparseValueType valueType,
+                      SparseFormat format) = 0;
+
+  /**
+   * @brief This should only be used for sparse matrix.
+   *
+   * Currently must be called for each row in order.
+   * The matrix is not valid until setRow is called for the last row.
+   */
+  virtual void setRow(size_t row,
+                      size_t colNum,
+                      const unsigned int* cols,
+                      const real* values) = 0;
+
+  virtual MatrixPtr getTranspose() = 0;
+
+  /**
+   * @brief  hard transpose.
+   *
+   * allocate matTrans' memory outside, then set memAlloc as false;
+   * else set as true.
+   */
+  virtual void transpose(MatrixPtr matTrans, bool memAlloc) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual MatrixPtr getInverse() {
+    LOG(FATAL) << "Not implemented";
+    return nullptr;
+  }
+
+  /**
+   * @brief  inverse.
+   *
+   * if allocate matInv's memory outside, then set memAlloc as false;
+   * else set as true.
+   */
+  virtual void inverse(MatrixPtr matInv, bool memAlloc) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+public:
+  /// Only set all variables to 0 or NULL but not free them.
+  virtual void clear() {
+    height_ = 0;
+    width_ = 0;
+    data_ = NULL;
+  }
+
+  void reshape(size_t height, size_t width);
+
+  /// add b to each sample of this.
+  virtual void addBias(Matrix& b, real scale) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void addSharedBias(Matrix& b, real scale) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  void addBias(Matrix& b, real scale, bool sharedBias) {
+    if (!sharedBias) {
+      addBias(b, scale);
+    } else {
+      addSharedBias(b, scale);
+    }
+  }
+
+  /// add each sample from a to this.
+  virtual void collectBias(Matrix& a, real scale) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void collectSharedBias(Matrix& a, real scale) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  void collectBias(Matrix& a, real scale, bool sharedBias) {
+    if (!sharedBias) {
+      collectBias(a, scale);
+    } else {
+      collectSharedBias(a, scale);
+    }
+  }
+
+  virtual void sequenceAvgForward(Matrix& a,
+                                  const IVector& startsPos,
+                                  int mode) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * this = scaleAB*(a*b) + scaleT*this
+   * @endcode
+   */
+  virtual void mul(const Matrix& a,
+                   const Matrix& b,
+                   real scaleAB,
+                   real scaleT) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// Add a vector (column) b to matrix a, column by column.
+  virtual void addColumnVector(const Matrix& b) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * For j < codeLength:
+   *   this(i, j) += vec(index(i, j), 0)
+   * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1
+   * @endcode
+   */
+  virtual void addByBitCode(size_t numClasses,
+                            const IVector& codes,
+                            const Matrix& vec) {
+    (void)numClasses;
+    (void)codes;
+    (void)vec;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength:
+   *   vec(index(i, j), 0) += this(i, j)
+   * where index is same as the index for addByBitCode
+   * @endcode
+   */
+  virtual void addByBitCodeBackward(size_t numClasses,
+                                    const IVector& codes,
+                                    Matrix& vec) {
+    (void)numClasses;
+    (void)codes;
+    (void)vec;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength:
+   *   this(i, j) += <mat.row(index(i, j)), input.row(i)>
+   * where index is same as the index for addByBitCode
+   * @endcode
+   */
+  virtual void mulByBitCode(size_t numClasses,
+                            const IVector& codes,
+                            const Matrix& mat,
+                            const Matrix& input) {
+    (void)numClasses;
+    (void)codes;
+    (void)mat;
+    (void)input;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength:
+   *   mat.row(index(i, j)) += this(i, j) * input.row(i)
+   * where index is same as the index for addByBitCode
+   * @endcode
+   */
+  virtual void mulByBitCodeBackwardWeight(size_t numClasses,
+                                          const IVector& codes,
+                                          Matrix& mat,
+                                          const Matrix& input) {
+    (void)numClasses;
+    (void)codes;
+    (void)mat;
+    (void)input;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength:
+   *   input.row(i) += this(i, j) * mat.row(index(i, j))
+   * where index is same as the index for addByBitCode
+   * @endcode
+   */
+  virtual void mulByBitCodeBackwardError(size_t numClasses,
+                                         const IVector& codes,
+                                         const Matrix& mat,
+                                         Matrix& input) {
+    (void)numClasses;
+    (void)codes;
+    (void)mat;
+    (void)input;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength
+   *   sum(i, 0) = scaleSum * \sum_j  bit(i, j) * this(i, j)
+   * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0
+   * @endcode
+   */
+  virtual void sumByBitCode(size_t numClasses,
+                            IVector& codes,
+                            Matrix& sum,
+                            real scaleSum) {
+    (void)numClasses;
+    (void)codes;
+    (void)sum;
+    (void)scaleSum;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   * For j < codeLength
+   *  this(i, j) -= bit(i, j)
+   * where bit(i, j) is same as that for sumByBitCode
+   * @endcode
+   */
+  virtual void subByBitCode(size_t numClasses_, IVector& codes) {
+    (void)numClasses_;
+    (void)codes;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * add the sum of each row of this to mat
+   */
+  virtual void rowSum(Matrix& sum) {
+    (void)sum;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * set the max of each row of this to mat
+   */
+  virtual void rowMax(Matrix& max) {
+    (void)max;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * set the max of each column of this to mat
+   */
+  virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; }
+
+  /**
+   * @brief Get the top k elements of each column of this matrix.
+   *
+   * The row ids and values of these elements are stored in
+   * maxIds and max respectively. where k is the size of maxIds.
+   * And note that the top k elements are not sorted.
+   */
+  virtual void colMax(IVector& maxIds, Matrix& maxVal) {
+    LOG(FATAL) << "not implemented";
+  }
+
+  virtual void maxoutForward(Matrix& a,
+                             IVector& id,
+                             size_t channels,
+                             size_t groups) {
+    LOG(FATAL) << "not implemented";
+  }
+
+  virtual void maxoutBackward(Matrix& a,
+                              IVector& id,
+                              size_t channels,
+                              size_t groups) {
+    LOG(FATAL) << "not implemented";
+  }
+
+  virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; }
+
+  /**
+   * @brief Get the top k elements of each row of this matrix.
+   *
+   * The column ids and values of these elements are stored in
+   * maxIds and max respectively. where k is the size of maxIds.
+   * And note that the top k elements are not sorted.
+   */
+  virtual void rowMax(IVector& maxIds, Matrix& max) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// normalize each row so that the sum of each row is 1.
+  virtual void rowNormalizeL1(Matrix& out) {
+    (void)out;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * @code
+   *  this = a*b
+   * @endcode
+   */
+  virtual void mul(const Matrix& a, const Matrix& b) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * this = scaleAB*(this*b) +  scaleT*this
+   * @endcode
+   */
+  virtual void rightMul(Matrix& b, real scaleAB, real scaleT) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * this = this* b
+   * @endcode
+   */
+  virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; }
+
+  /**
+   * @code
+   * this = scaleAB*(a*this) +  scaleT*this
+   * @endcode
+   */
+  virtual void leftMul(Matrix& a, real scaleAB, real scaleT) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @code
+   * this = a*this
+   * @endcode
+   */
+  virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; }
+
+  /// merge the element for each col.
+  virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; }
+
+  /// copy -log(output[label]) to this->data[i].
+  virtual void oneHotCrossEntropy(Matrix& output, IVector& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// calculate the error of outputV according to label.
+  virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// copy -log(output[label]) to this->data[i].
+  virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output,
+                                              IVector& label,
+                                              real alpha) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// calculate the error of outputV according to label.
+  virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
+                                                IVector& label,
+                                                real alpha) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * \f[
+   *  a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j}
+   * \f]
+   *
+   * b contains M elements,
+   * c contains N elements (N is odd),
+   * b's index arithmetic is computed modulo M,
+   * c's index arithmetic is computed modulo N.
+   */
+  virtual void circularConv(Matrix& b, Matrix& c) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void circularConvDerivative(Matrix& output,
+                                      Matrix& prevOut1,
+                                      Matrix& prevOut2,
+                                      Matrix& prevGrad1,
+                                      Matrix& prevGrad2) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */
+  virtual void softmax(Matrix& output) {
+    (void)output;
+    LOG(FATAL) << "Not implemeted";
+  }
+  virtual void sequenceSoftmax(Matrix& output, const IVector& index) {
+    (void)output;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void softmaxBackward(Matrix& outputV) {
+    (void)outputV;
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /*
+    sum_i = sum_j this_ij * output_ij
+    this_ij = output_ij* (this_ij - sum_i)
+  */
+  virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// calculate the sum of squares diff cost.
+  virtual void sumOfSquares(Matrix& output, Matrix& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// gradient of sumOfSquares.
+  virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; }
+
+  virtual void tanhDerivative(Matrix& output) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; }
+
+  virtual void softreluDerivative(Matrix& output) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void scaledTanh(Matrix& output, real p1, real p2) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * cosine similarity, for each row i,
+   *   this[i] = cos(output1[i], output2[i])
+   *
+   * output2 can only have one row, then for each row i,
+   *   this[i] = cos(output1[i], output2[0])
+   */
+  virtual void cosSim(Matrix& output1, Matrix& output2, real scale = 1.0f) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void cosSimDerivative(Matrix& output,
+                                Matrix& prevOut1,
+                                Matrix& prevOut2,
+                                Matrix& prevGrad1,
+                                Matrix& prevGrad2,
+                                real scale = 1.0f) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// print out the values of elements to os
+  virtual void print(std::ostream& os) const {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * print a part of the matrix
+   * from the (top,left) value to the (height, width) value (not included)
+   */
+  virtual void print(std::ostream& os, size_t height, size_t width) const {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /// print one row to os
+  virtual void printOneRow(std::ostream& os, size_t idx) const {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {}
+
+  virtual real getMin() {
+    LOG(FATAL) << "Not implemented";
+    return 0;
+  }
+  virtual real getMax() {
+    LOG(FATAL) << "Not implemented";
+    return 0;
+  }
+
+  virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; }
+
+  /**
+   * @brief  calculate the error of classification
+   *
+   * output[i] = 1 if row i is an error.
+   *
+   * output[i] = 0 if row i is correct.
+   */
+  virtual void classificationError(Matrix& output, IVector& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * This function is used to calculate the convolution:
+   *
+   * It will expand a feature matrix according to the
+   * convolution filters
+   */
+  virtual void convExpand(Matrix& feature,
+                          int feaImgHeight,
+                          int feaImgWidth,
+                          int channels,
+                          int blockH,
+                          int blockW,
+                          int strideH,
+                          int strideW,
+                          int paddingH,
+                          int paddingW,
+                          int outputH,
+                          int outputW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * This function is the reverse implementation of convExpand:
+   *
+   * Its function is to restore an expanded matrix into a feature matrix
+   */
+  virtual void convShrink(Matrix& expandColMat,
+                          int thisImgHeight,
+                          int thisImgWidth,
+                          int channels,
+                          int blockH,
+                          int blockW,
+                          int strideH,
+                          int strideW,
+                          int paddingH,
+                          int paddingW,
+                          int outputH,
+                          int outputW,
+                          real alpha = 1.0f,
+                          real beta = 0.0f) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /**
+   * Pooling forward operation, pick out the largest element
+   * in the sizeX of value
+   */
+  virtual void maxPoolForward(Matrix& inputMat,
+                              size_t imgSizeH,
+                              size_t imgSizeW,
+                              size_t channels,
+                              size_t sizeX,
+                              size_t sizeY,
+                              size_t strideH,
+                              size_t strideW,
+                              size_t outputH,
+                              size_t outputW,
+                              size_t paddingH,
+                              size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /// Pooling backward operation.
+  virtual void maxPoolBackward(Matrix& image,
+                               size_t imgSizeH,
+                               size_t imgSizeW,
+                               Matrix& outGrad,
+                               Matrix& outV,
+                               size_t sizeX,
+                               size_t sizeY,
+                               size_t strideH,
+                               size_t strideW,
+                               size_t outputH,
+                               size_t outputW,
+                               real scaleTargets,
+                               real scaleOutput,
+                               size_t paddingH,
+                               size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  /// Pooling forward operation, calculate the average of sizeX elements.
+  virtual void avgPoolForward(Matrix& input,
+                              size_t imgSizeH,
+                              size_t imgSizeW,
+                              size_t channels,
+                              size_t sizeX,
+                              size_t sizeY,
+                              size_t strideH,
+                              size_t strideW,
+                              size_t outputH,
+                              size_t outputW,
+                              size_t paddingH,
+                              size_t paddingW) {
+    LOG(FATAL) << "Not implemeted";
+  }
+
+  virtual void avgPoolBackward(Matrix& input,
+                               size_t imgSizeH,
+                               size_t imgSizeW,
+                               size_t sizeX,
+                               size_t sizeY,
+                               size_t strideH,
+                               size_t strideW,
+                               size_t outputH,
+                               size_t outputW,
+                               real scaleTargets,
+                               real scaleOutput,
+                               size_t paddingH,
+                               size_t paddingW) {
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+
+  /**
+   * Input: one or more sequences. Each sequence contains some instances.
+   *
+   * Output: output size is the number of input sequences (NOT input
+   * instances).
+   *
+   * output[i] is set to max_input[i].
+   */
+  virtual void maxSequenceForward(Matrix& input,
+                                  const IVector& sequence,
+                                  IVector& index) {
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+
+  virtual void maxSequenceBackward(Matrix& outputGrad,
+                                   const IVector& sequence,
+                                   IVector& index) {
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+
+  /**
+   * @code
+   * this.row[i] += table.row[ids[i]]
+   * if ids[i] == -1, it will be ignored
+   * @endcode
+   */
+  virtual void selectRows(Matrix& table, IVector& ids) {
+    (void)table;  // intentionally unused in this stub
+    (void)ids;    // intentionally unused in this stub
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+
+  /**
+   * @code
+   * this[i] = table[i, ids[i]]
+   * @endcode
+   */
+  virtual void selectElements(Matrix& table, IVector& ids) {
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+
+  /**
+   * @code
+   * table.row[ids[i]] += this.row[i]
+   * if ids[i] == -1, it will be ignored
+   * @endcode
+   */
+  virtual void addToRows(Matrix& table, IVector& ids) {
+    (void)table;  // intentionally unused in this stub
+    (void)ids;    // intentionally unused in this stub
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+
+  /**
+   * @code
+   * table[i, ids[i]] += this[i]
+   * @endcode
+   */
+  virtual void addElements(Matrix& table, IVector& ids) {
+    LOG(FATAL) << "Not implemented";  // overridden by CpuMatrix, not GpuMatrix
+  }
+  /**
+   * @brief  Cross entropy for multi binary labels.
+   *
+   * @code
+   * this[i] = -sum(label[i][j]*log(output[i][j])
+   *           + (1-label[i][j])*log(1-output[i][j]))
+   * @endcode
+   */
+  virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+
+  /**
+   * @brief  The gradient of cross entropy for multi binary labels on output.
+   *
+   * @code
+   * this[i][j] = -label[i][j]/output[i][j]
+   *              + (1-label[i][j])/(1-output[i][j])
+   * @endcode
+   */
+  virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+
+  /**
+   * @brief  Calculate the classification error for multi binary labels.
+   *
+   * @code
+   * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0)
+   *            || (output[i][j] < threshold && label[i][j] == 1))
+   *            / output.getWidth()
+   * @endcode
+   */
+  virtual void classificationErrorMulti(Matrix& output,
+                                        Matrix& label,
+                                        real threshold) {
+    LOG(FATAL) << "Not implemented";  // overridden by CpuMatrix, not GpuMatrix
+  }
+
+  virtual void paramReluForward(Matrix& data, Matrix& W) {
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+  virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) {
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+  virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+
+  virtual void bilinearForward(const Matrix& in,
+                               const size_t inImgH,
+                               const size_t inImgW,
+                               const size_t outImgH,
+                               const size_t outImgW,
+                               const size_t numChannels,
+                               const real ratioH,
+                               const real ratioW) {
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+  virtual void bilinearBackward(const Matrix& out,
+                                const size_t outImgH,
+                                const size_t outImgW,
+                                const size_t inImgH,
+                                const size_t inImgW,
+                                const size_t numChannels,
+                                const real ratioH,
+                                const real ratioW) {
+    LOG(FATAL) << "Not implemented";  // overridden by GpuMatrix and CpuMatrix
+  }
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    if (!useGpu_) {  // useGpu_ clear: hand expr to TensorCpuApply
+      TensorCpuApply<real>(*this, expr);
+    } else {  // useGpu_ set: hand expr to TensorGpuApply
+      TensorGpuApply<real>(*this, expr);
+    }
+  }
+
+  bool isEmpty() const {
+    return data_ == nullptr;  // empty means data_ compares equal to nullptr
+  }
+
+  explicit operator bool() const {
+    return !isEmpty();  // true exactly when isEmpty() is false
+  }
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
+  mat.print(os);  // delegate formatting to the matrix's print()
+  return os;  // return the stream so << calls can be chained
+}
+
+class GpuMatrix : public Matrix {
+public:
+  GpuMatrix();
+
+  GpuMatrix(size_t height, size_t width, bool trans = false);
+  GpuMatrix(real* data, size_t height, size_t width, bool trans = false)
+      : Matrix(data, height, width, trans, true) {}
+  GpuMatrix(real* data,
+            size_t height,
+            size_t width,
+            size_t stride,
+            bool trans = false)
+      : Matrix(data, height, width, stride, trans, true) {}
+  GpuMatrix(GpuMemHandlePtr dataHandle,
+            size_t height,
+            size_t width,
+            bool trans = false)
+      : Matrix(dataHandle, height, width, trans, true) {}
+  ~GpuMatrix();
+
+  void zeroMem();
+  void resetOne();
+  void setDiag(real value);
+
+  void resize(size_t newHeight, size_t newWidth);
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {
+    LOG(FATAL) << "Only Support Sparse Matrix";
+  }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {
+    LOG(FATAL) << "Only Support Sparse Matrix";
+  }
+
+  /**
+   * Copy the data from cpu_memory buffer
+   */
+  void copyFrom(const real* hostSrc, size_t size);
+
+  void copyFrom(const real* hostSrc, const int64_t* seq);
+
+  void copyFrom(const Matrix& src, hl_stream_t stream);
+
+  void copyFrom(const Matrix& src);
+
+  void copyFrom(const IVector& src);
+
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
+
+  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
+
+  real getElement(size_t x, size_t y) const;
+
+  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
+  virtual real* getRowBuf(size_t row) { return getRow(row); }
+
+  real getSum();
+  void accumulateColSum(Matrix& src);
+  real getAbsSum();
+
+  MatrixPtr getTranspose();
+  void transpose(MatrixPtr matTrans, bool memAlloc);
+
+  MatrixPtr getInverse();
+  void inverse(MatrixPtr matInv, bool memAlloc);
+
+  /// add b to each sample of this.
+  void addBias(Matrix& b, real scale);
+  void addSharedBias(Matrix& b, real scale);
+
+  /**
+   * @code
+   * add each sample from a to this.
+   * @endcode
+   */
+  void collectBias(Matrix& a, real scale);
+  void collectSharedBias(Matrix& a, real scale);
+
+  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
+
+  /**
+   * @code
+   * this.row[i] += table.row[ids[i]]
+   * @endcode
+   */
+  virtual void selectRows(Matrix& table, IVector& ids);
+
+  /**
+   * @code
+   * this[i] = table[i, id[i]]
+   * @endcode
+   */
+  virtual void selectElements(Matrix& table, IVector& ids);
+
+  /**
+   * @code
+   * table.row[ids[i]] += this.row[i]
+   * @endcode
+   */
+  virtual void addToRows(Matrix& table, IVector& ids);
+
+  void addColumnVector(const Matrix& b);
+
+  /**
+   * @code
+   * this = scaleAB*(a*b) + scaleT*this
+   * @endcode
+   */
+  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
+
+  /**
+   * @code
+   * this = a*b
+   * @endcode
+   */
+  void mul(const Matrix& a, const Matrix& b);
+
+  void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
+
+  void mul(const GpuSparseMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT);
+
+  void mul(const GpuMatrix& a,
+           const GpuSparseMatrix& b,
+           real scaleAB,
+           real scaleT);
+
+  /**
+   * @code
+   * this = scaleAB*(this*b) +  scaleT*this
+   * @endcode
+   */
+  void rightMul(Matrix& b, real scaleAB, real scaleT);
+
+  /**
+   * @code
+   * this = this* b
+   * @endcode
+   */
+  void rightMul(Matrix& b);
+
+  /**
+   * @code
+   * this = scaleAB*(a*this) +  scaleT*this
+   * @endcode
+   */
+  void leftMul(Matrix& a, real scaleAB, real scaleT);
+
+  /**
+   * @code
+   * this = a*this
+   * @endcode
+   */
+  void leftMul(Matrix& a);
+
+  void colMerge(Matrix& src);
+  void rowSum(Matrix& sum);
+  void rowMax(Matrix& max);
+  void rowMax(IVector& maxIds, Matrix& max);
+  void colMax(Matrix& max);
+  void colMax(IVector& maxIds, Matrix& max);
+  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
+  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
+
+  void oneHotCrossEntropy(Matrix& output, IVector& label);
+  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
+  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
+                                      IVector& label,
+                                      real alpha);
+  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
+                                        IVector& label,
+                                        real alpha);
+
+  void softmax(Matrix& output);
+  void sequenceSoftmax(Matrix& output, const IVector& index);
+  void softmaxBackward(Matrix& outputV);
+  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
+
+  /// calculate the sum of squares diff cost.
+  void sumOfSquares(Matrix& output, Matrix& label);
+
+  /// gradient of sumOfSquares.
+  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
+  void tanh(Matrix& output);
+  void tanhDerivative(Matrix& output);
+  void softrelu(Matrix& output);
+  void softreluDerivative(Matrix& output);
+  void scaledTanh(Matrix& output, real p1, real p2);
+
+  void cosSim(Matrix& output1, Matrix& output2, real scale);
+  void cosSimDerivative(Matrix& output,
+                        Matrix& prevOut1,
+                        Matrix& prevOut2,
+                        Matrix& prevGrad1,
+                        Matrix& prevGrad2,
+                        real scale);
+
+  virtual void print(std::ostream& os) const;
+  virtual void print(std::ostream& os, size_t height, size_t width) const;
+
+  void paramReluForward(Matrix& data, Matrix& W);
+  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
+  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
+
+  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
+  void randomizeUniform();
+
+  void classificationError(Matrix& output, IVector& label);
+
+  void convExpand(Matrix& feature,
+                  int feaImgHeight,
+                  int feaImgWidth,
+                  int channels,
+                  int blockH,
+                  int blockW,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  int outputH,
+                  int outputW);
+
+  void convShrink(Matrix& expandColMat,
+                  int thisImgHeight,
+                  int thisImgWidth,
+                  int channels,
+                  int blockH,
+                  int blockW,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  int outputH,
+                  int outputW,
+                  real alpha = 1.0f,
+                  real beta = 0.0f);
+
+  void maxPoolForward(Matrix& inputMat,
+                      size_t imgSizeH,
+                      size_t imgSizeW,
+                      size_t channels,
+                      size_t sizeX,
+                      size_t sizeY,
+                      size_t strideH,
+                      size_t strideW,
+                      size_t outputH,
+                      size_t outputW,
+                      size_t paddingH,
+                      size_t paddingW);
+
+  void maxPoolBackward(Matrix& image,
+                       size_t imgSizeH,
+                       size_t imgSizeW,
+                       Matrix& outGrad,
+                       Matrix& outV,
+                       size_t sizeX,
+                       size_t sizeY,
+                       size_t strideH,
+                       size_t strideW,
+                       size_t outputH,
+                       size_t outputW,
+                       real scaleTargets,
+                       real scaleOutput,
+                       size_t paddingH,
+                       size_t paddingW);
+
+  void avgPoolForward(Matrix& input,
+                      size_t imgSizeH,
+                      size_t imgSizeW,
+                      size_t channels,
+                      size_t sizeX,
+                      size_t sizeY,
+                      size_t strideH,
+                      size_t strideW,
+                      size_t outputH,
+                      size_t outputW,
+                      size_t paddingH,
+                      size_t paddingW);
+
+  void avgPoolBackward(Matrix& input,
+                       size_t imgSizeH,
+                       size_t imgSizeW,
+                       size_t sizeX,
+                       size_t sizeY,
+                       size_t strideH,
+                       size_t strideW,
+                       size_t outputH,
+                       size_t outputW,
+                       real scaleTargets,
+                       real scaleOutput,
+                       size_t paddingH,
+                       size_t paddingW);
+
+  void maxSequenceForward(Matrix& input,
+                          const IVector& sequence,
+                          IVector& index);
+
+  void maxSequenceBackward(Matrix& outputGrad,
+                           const IVector& sequence,
+                           IVector& index);
+
+  void bilinearForward(const Matrix& in,
+                       const size_t inImgH,
+                       const size_t inImgW,
+                       const size_t outImgH,
+                       const size_t outImgW,
+                       const size_t numChannels,
+                       const real ratioH,
+                       const real ratioW);
+
+  void bilinearBackward(const Matrix& out,
+                        const size_t outImgH,
+                        const size_t outImgW,
+                        const size_t inImgH,
+                        const size_t inImgW,
+                        const size_t numChannels,
+                        const real ratioH,
+                        const real ratioW);
+
+  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
+
+  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    TensorGpuApply<real>(*this, expr);
+  }
+};
+
+class CpuMatrix : public Matrix {
+public:
+  CpuMatrix(size_t height, size_t width, bool trans = false);
+  CpuMatrix(real* data, size_t height, size_t width, bool trans = false)
+      : Matrix(data, height, width, trans, false) {}
+  CpuMatrix(real* data,
+            size_t height,
+            size_t width,
+            size_t stride,
+            bool trans = false)
+      : Matrix(data, height, width, stride, trans, false) {}
+
+  CpuMatrix(CpuMemHandlePtr dataHandle,
+            size_t height,
+            size_t width,
+            bool trans = false)
+      : Matrix(dataHandle, height, width, trans, false) {}
+
+  ~CpuMatrix();
+
+  void zeroMem();
+  void resetOne();
+  void setDiag(real value);
+
+  void resize(size_t newHeight, size_t newWidth);
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {
+    LOG(FATAL) << "Only Support Sparse Matrix";
+  }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {
+    LOG(FATAL) << "Only Support Sparse Matrix";
+  }
+
+  real getElement(size_t x, size_t y) const;
+  real getSum();
+  void accumulateColSum(Matrix& src);
+  real getAbsSum();
+
+  MatrixPtr getTranspose();
+  void transpose(MatrixPtr matTrans, bool memAlloc);
+
+  MatrixPtr getInverse();
+  void inverse(MatrixPtr matInv, bool memAlloc);
+
+  void copyFrom(const Matrix& src);
+
+  void copyFrom(const Matrix& src, hl_stream_t stream);
+
+  void copyFrom(const real* cpuSrc, size_t size);
+
+  void copyFrom(const real* cpuSrc, const int64_t* seq);
+
+  void copyFrom(const IVector& src);
+
+  void copyFrom(CpuSparseMatrix& src);
+
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
+
+  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
+
+  void convExpand(Matrix& feature,
+                  int feaImgHeight,
+                  int feaImgWidth,
+                  int channels,
+                  int blockH,
+                  int blockW,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  int outputH,
+                  int outputW);
+
+  void convShrink(Matrix& expandFeat,
+                  int thisImgHeight,
+                  int thisImgWidth,
+                  int channels,
+                  int blockH,
+                  int blockW,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  int outputH,
+                  int outputW,
+                  real alpha = 1.0f,
+                  real beta = 0.0f);
+
+  void maxPoolForward(Matrix& inputMat,
+                      size_t imgSizeH,
+                      size_t imgSizeW,
+                      size_t channels,
+                      size_t sizeX,
+                      size_t sizeY,
+                      size_t strideH,
+                      size_t strideW,
+                      size_t outputH,
+                      size_t outputW,
+                      size_t paddingH,
+                      size_t paddingW);
+
+  void maxPoolBackward(Matrix& image,
+                       size_t imgSizeH,
+                       size_t imgSizeW,
+                       Matrix& outGrad,
+                       Matrix& outV,
+                       size_t sizeX,
+                       size_t sizeY,
+                       size_t strideH,
+                       size_t strideW,
+                       size_t outputH,
+                       size_t outputW,
+                       real scaleTargets,
+                       real scaleOutput,
+                       size_t paddingH,
+                       size_t paddingW);
+
+  void avgPoolForward(Matrix& input,
+                      size_t imgSizeH,
+                      size_t imgSizeW,
+                      size_t channels,
+                      size_t sizeX,
+                      size_t sizeY,
+                      size_t strideH,
+                      size_t strideW,
+                      size_t outputH,
+                      size_t outputW,
+                      size_t paddingH,
+                      size_t paddingW);
+
+  void avgPoolBackward(Matrix& input,
+                       size_t imgSizeH,
+                       size_t imgSizeW,
+                       size_t sizeX,
+                       size_t sizeY,
+                       size_t strideH,
+                       size_t strideW,
+                       size_t outputH,
+                       size_t outputW,
+                       real scaleTargets,
+                       real scaleOutput,
+                       size_t paddingH,
+                       size_t paddingW);
+
+  void maxSequenceForward(Matrix& input,
+                          const IVector& sequence,
+                          IVector& index);
+
+  void maxSequenceBackward(Matrix& outputGrad,
+                           const IVector& sequence,
+                           IVector& index);
+
+  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
+  virtual real* getRowBuf(size_t row) { return getRow(row); }
+
+public:
+  /// add b to each sample of this.
+  void addBias(Matrix& b, real scale);
+  void addSharedBias(Matrix& b, real scale);
+
+  /// add each sample of a to this.
+  void collectBias(Matrix& a, real scale);
+  void collectSharedBias(Matrix& a, real scale);
+
+  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
+
+  /**
+   * @code
+   * this.row[i] += table.row[ids[i]]
+   * @endcode
+   */
+  virtual void selectRows(Matrix& table, IVector& ids);
+
+  /**
+   * @code
+   * table.row[ids[i]] += this.row[i]
+   * @endcode
+   */
+  virtual void addToRows(Matrix& table, IVector& ids);
+
+  /**
+   * @code
+   * this[i] = table[i, id[i]]
+   * @endcode
+   */
+  virtual void selectElements(Matrix& table, IVector& ids);
+
+  /**
+   * @code
+   * table[i, id[i]] += this[i]
+   * @endcode
+   */
+  virtual void addElements(Matrix& table, IVector& ids);
+
+  /**
+   * Uses the abstract getRow() to get a row from table.
+   *
+   * table is a template type instead of a virtual class for performance.
+   * Used internally by the two corresponding virtual functions above.
+   */
+  template <typename TableMatType>
+  void selectRowsImp(TableMatType& table, IVector& ids);
+  template <typename TableMatType>
+  void addToRowsImp(TableMatType& table, IVector& ids);
+
+  void addColumnVector(const Matrix& b);
+
+  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
+  void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
+
+  void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT);
+
+  static void mul(CpuMatrix* a,
+                  CpuMatrix* b,
+                  CpuSparseMatrix* c,
+                  real scaleAB,
+                  real scaleT);
+
+  /**
+   * c = a * b
+   *
+   * use abstract getRow() to get row from B,C.
+   * Define B,C as template instead of virtual class for performance sake.
+   */
+  template <typename MatBType, typename MatCType>
+  static void mul(
+      CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT);
+
+  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
+
+  void mul(const Matrix& a, const Matrix& b);
+
+  void rightMul(Matrix& b, real scaleAB, real scaleT);
+  void rightMul(Matrix& b);
+
+  void leftMul(Matrix& a, real scaleAB, real scaleT);
+  void leftMul(Matrix& a);
+  void colMerge(Matrix& src);
+  void rowSum(Matrix& sum);
+  void rowMaxId(IVector& maxIds);
+  void rowMax(Matrix& max);
+  void rowMax(IVector& maxIds, Matrix& maxVal);
+  void colMax(Matrix& max);
+  void colMax(IVector& maxIds, Matrix& maxVal);
+  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
+  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
+  void rowNormalizeL1(Matrix& out);
+
+  void oneHotCrossEntropy(Matrix& output, IVector& label);
+  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
+  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
+                                      IVector& label,
+                                      real alpha);
+  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
+                                        IVector& label,
+                                        real alpha);
+
+  void circularConv(Matrix& b, Matrix& c);
+  void circularConvDerivative(Matrix& output,
+                              Matrix& prevOut1,
+                              Matrix& prevOut2,
+                              Matrix& prevGrad1,
+                              Matrix& prevGrad2);
+
+  void softmax(Matrix& output);
+  void sequenceSoftmax(Matrix& output, const IVector& index);
+  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
+
+  /// calculate the sum of squares diff cost.
+  void sumOfSquares(Matrix& output, Matrix& label);
+
+  /// gradient of sumOfSquares.
+  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
+
+  void tanh(Matrix& output);
+  void tanhDerivative(Matrix& output);
+
+  void softrelu(Matrix& output);
+  void softreluDerivative(Matrix& output);
+  void scaledTanh(Matrix& output, real p1, real p2);
+
+  void cosSim(Matrix& output1, Matrix& output2, real scale);
+  void cosSimDerivative(Matrix& output,
+                        Matrix& prevOut1,
+                        Matrix& prevOut2,
+                        Matrix& prevGrad1,
+                        Matrix& prevGrad2,
+                        real scale);
+
+  void print(std::ostream& os) const;
+  void print(std::ostream& os, size_t height, size_t width) const;
+  void printOneRow(std::ostream& os, size_t idx) const;
+
+  void paramReluForward(Matrix& data, Matrix& W);
+  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
+  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
+
+  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
+
+  real getMin();
+  real getMax();
+
+  void randomizeUniform();
+
+  void classificationError(Matrix& output, IVector& label);
+
+  void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec);
+
+  void addByBitCodeBackward(size_t numClasses,
+                            const IVector& codes,
+                            Matrix& vec);
+
+  void mulByBitCode(size_t numClasses,
+                    const IVector& codes,
+                    const Matrix& mat,
+                    const Matrix& input);
+
+  void mulByBitCodeBackwardWeight(size_t numClasses,
+                                  const IVector& codes,
+                                  Matrix& mat,
+                                  const Matrix& input);
+
+  void mulByBitCodeBackwardError(size_t numClasses,
+                                 const IVector& codes,
+                                 const Matrix& mat,
+                                 Matrix& input);
+
+  void sumByBitCode(size_t numClasses,
+                    IVector& codes,
+                    Matrix& sum,
+                    real scaleSum);
+
+  void subByBitCode(size_t numClasses_, IVector& codes);
+
+  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
+  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
+  void classificationErrorMulti(Matrix& output, Matrix& label, real threshold);
+
+  void bilinearForward(const Matrix& in,
+                       const size_t inImgH,
+                       const size_t inImgW,
+                       const size_t outImgH,
+                       const size_t outImgW,
+                       const size_t numChannels,
+                       const real ratioH,
+                       const real ratioW);
+
+  void bilinearBackward(const Matrix& out,
+                        const size_t outImgH,
+                        const size_t outImgW,
+                        const size_t inImgH,
+                        const size_t inImgW,
+                        const size_t numChannels,
+                        const real ratioH,
+                        const real ratioW);
+
+  template <typename ExpressionType>
+  void operator=(const ExpressionType& expr) {
+    TensorCpuApply<real>(*this, expr);
+  }
+};
+
+class SharedCpuMatrix : public CpuMatrix {
+public:
+  /* blockNum is the number of partitions of the matrix */
+  SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
+      : CpuMatrix(height, width, trans) {
+    initShared(blockNum);
+  }
+  SharedCpuMatrix(
+      int blockNum, real* data, size_t height, size_t width, bool trans = false)
+      : CpuMatrix(data, height, width, trans) {
+    initShared(blockNum);
+  }
+
+  SharedCpuMatrix(int blockNum,
+                  CpuMemHandlePtr dataHandle,
+                  size_t height,
+                  size_t width,
+                  bool trans = false)
+      : CpuMatrix(dataHandle, height, width, trans) {
+    initShared(blockNum);
+  }
+
+  SharedCpuMatrix(CpuMemHandlePtr dataHandle,
+                  size_t height,
+                  size_t width,
+                  bool trans = false)
+      : CpuMatrix(dataHandle, height, width, trans) {
+    initBlock(1);  // NOTE(review): other ctors call initShared() -- confirm
+  }
+
+  ~SharedCpuMatrix() {}
+
+public:
+  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
+  virtual void add(Matrix& b, real p1, real p2);
+  virtual void add(real p1, real p2);
+
+private:
+  using Matrix::mul;  // keep the Matrix::mul overloads visible in this class
+  void initShared(int blockNum);
+  void initBlock(int blockNum);
+
+  int blockNum_;  // presumably the partition count -- TODO confirm
+  std::vector<std::unique_ptr<std::mutex>> blockLocks_;
+  ThreadLocal<CpuMatrixPtr> localBuf_;
+  ThreadLocal<std::vector<int>> localBufRows_;
+  ThreadLocal<std::vector<int>> blockSeq_;
+};
+
+typedef struct { unsigned int col; } sparse_non_value_t;  // col only, no value
+
+typedef struct {
+  unsigned int col;  // presumably the column index -- TODO confirm
+  float value;       // float payload stored alongside col
+} sparse_float_value_t;
+
+}  // namespace paddle
+#include "ExecViaCpu.h"

From f3fdfd941f170fbcfa5162246803b4cf8be6131c Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 5 Jan 2017 21:32:55 +0800
Subject: [PATCH 06/11] add some comments for Function.h

---
 paddle/function/BufferArg.h | 26 -----------------------
 paddle/function/Function.h  | 42 +++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+), 26 deletions(-)

diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 9fcda7a878aad..52494afed3b85 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -46,32 +46,6 @@ class SequenceArg;
 class SparseMatrixArg;
 typedef std::shared_ptr<BufferArg> BufferArgPtr;
 
-class BufferArgs {
-public:
-  BufferArgs() {}
-  size_t size() const { return args_.size(); }
-
-  // add argument into BufferArgss
-  template <typename Tensor>
-  void addArg(const Tensor& arg) {
-    args_.push_back(std::make_shared<BufferArg>(arg));
-  }
-
-  void addArg(const Matrix& arg, const TensorShape& shape);
-
-  void addArg(const CpuSparseMatrix& arg);
-  void addArg(const GpuSparseMatrix& arg);
-
-  // get argument
-  const BufferArg& operator[](size_t num) const {
-    CHECK_LT(num, args_.size());
-    return *args_[num];
-  }
-
-private:
-  std::vector<BufferArgPtr> args_;
-};
-
 // an array of arbitrary dimensions
 class BufferArg {
 public:
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 024575b4f7bcd..27ebe808aaf44 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -22,6 +22,11 @@ limitations under the License. */
 
 namespace paddle {
 
+/**
+ * Function Configuration.
+ * The argument type of Function::init.
+ * A follow-up change will consider moving this data structure into Proto.
+ */
 class FuncConfig {
 public:
   union value {
@@ -41,6 +46,43 @@ class FuncConfig {
   std::map<std::string, value> valueMap_;
 };
 
+/**
+ * Argument type for Function::calc().
+ * A BufferArgs contains a set of BufferArg,
+ * because Function can have multiple inputs, outputs and inouts.
+ */
+class BufferArgs {
+public:
+  BufferArgs() {}
+  size_t size() const { return args_.size(); }
+
+  // add argument into BufferArgss
+  template <typename Tensor>
+  void addArg(const Tensor& arg) {
+    args_.push_back(std::make_shared<BufferArg>(arg));
+  }
+
+  void addArg(const Matrix& arg, const TensorShape& shape);
+
+  void addArg(const CpuSparseMatrix& arg);
+  void addArg(const GpuSparseMatrix& arg);
+
+  // get argument
+  const BufferArg& operator[](size_t num) const {
+    CHECK_LT(num, args_.size());
+    return *args_[num];
+  }
+
+private:
+  std::vector<BufferArgPtr> args_;
+};
+
+/**
+ * Base class for Function.
+ * The basic Function implementation requires override init and calc interfaces.
+ * Need to pay attention to the inouts argument. For the input argument
+ * that will be modified, it needs to be passed through inouts.
+ */
 class FunctionBase {
 public:
   virtual ~FunctionBase() {}

From ccf0b1bb2e32e0b52b56af18c1c8e339eface97e Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 5 Jan 2017 21:45:00 +0800
Subject: [PATCH 07/11] add FunctionTest.cpp

---
 paddle/function/BufferArg.cpp     | 12 -------
 paddle/function/BufferArgTest.cpp | 40 +--------------------
 paddle/function/CMakeLists.txt    |  1 +
 paddle/function/Function.cpp      | 12 +++++++
 paddle/function/FunctionTest.cpp  | 59 +++++++++++++++++++++++++++++++
 5 files changed, 73 insertions(+), 51 deletions(-)
 create mode 100644 paddle/function/FunctionTest.cpp

diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
index 08031917b21e1..65c6f303041d8 100644
--- a/paddle/function/BufferArg.cpp
+++ b/paddle/function/BufferArg.cpp
@@ -28,16 +28,4 @@ const SparseMatrixArg& BufferArg::sparse() const {
   return dynamic_cast<const SparseMatrixArg&>(*this);
 }
 
-void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape) {
-  args_.push_back(std::make_shared<BufferArg>(arg, shape));
-}
-
-void BufferArgs::addArg(const CpuSparseMatrix& arg) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg));
-}
-
-void BufferArgs::addArg(const GpuSparseMatrix& arg) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg));
-}
-
 }  // namespace paddle
diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
index 5d669b8137e1a..a9ee3ab079e33 100644
--- a/paddle/function/BufferArgTest.cpp
+++ b/paddle/function/BufferArgTest.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "BufferArg.h"
 #include <gtest/gtest.h>
+#include "Function.h"
 #include "paddle/math/MemoryHandle.h"
 
 namespace paddle {
@@ -86,43 +87,4 @@ TEST(BufferTest, asArgument) {
   function(argments);
 }
 
-template <DeviceType DType>
-void FunctionApi(typename Tensor<real, DType>::Matrix& output,
-                 const typename Tensor<real, DType>::Matrix& input);
-
-template <>
-void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 100);
-  EXPECT_EQ(output.getWidth(), 200);
-}
-
-template <>
-void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 10);
-  EXPECT_EQ(output.getWidth(), 20);
-}
-
-template <DeviceType DType>
-void Function(const BufferArgs& arguments) {
-  auto input = arguments[0].matrix<DType>();
-  auto output = arguments[1].matrix<DType>();
-  FunctionApi<DType>(output, input);
-}
-
-TEST(BufferTest, Function) {
-  CpuMatrix cpuInput = CpuMatrix(100, 200);
-  CpuMatrix cpuOutput = CpuMatrix(100, 200);
-  BufferArgs cpuArgments;
-  cpuArgments.addArg(cpuInput);
-  cpuArgments.addArg(cpuOutput);
-  Function<DEVICE_TYPE_CPU>(cpuArgments);
-
-  GpuMatrix gpuInput = GpuMatrix(10, 20);
-  GpuMatrix gpuOutput = GpuMatrix(10, 20);
-  BufferArgs gpuArgments;
-  gpuArgments.addArg(gpuInput);
-  gpuArgments.addArg(gpuOutput);
-  Function<DEVICE_TYPE_GPU>(gpuArgments);
-}
-
 }  // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 37c011549eca9..31c395c8484a3 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -21,6 +21,7 @@ if(WITH_TESTING)
     add_simple_unittest(TensorShapeTest)
     add_simple_unittest(TensorTypeTest)
     add_simple_unittest(BufferArgTest)
+    add_simple_unittest(FunctionTest)
     # add_unittest(ContextProjectionOpTest
     #    ContextProjectionOpTest.cpp
     #    ../gserver/tests/TestUtil.cpp)
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index 6f82a8d053bc2..2f56cfc1b5492 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -72,6 +72,18 @@ FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
   return *this;
 }
 
+void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape) {
+  args_.push_back(std::make_shared<BufferArg>(arg, shape));
+}
+
+void BufferArgs::addArg(const CpuSparseMatrix& arg) {
+  args_.push_back(std::make_shared<SparseMatrixArg>(arg));
+}
+
+void BufferArgs::addArg(const GpuSparseMatrix& arg) {
+  args_.push_back(std::make_shared<SparseMatrixArg>(arg));
+}
+
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
 
 }  // namespace paddle
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
new file mode 100644
index 0000000000000..7c3d6684cded1
--- /dev/null
+++ b/paddle/function/FunctionTest.cpp
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+template <DeviceType DType>
+void FunctionApi(typename Tensor<real, DType>::Matrix& output,
+                 const typename Tensor<real, DType>::Matrix& input);
+
+template <>
+void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 100);
+  EXPECT_EQ(output.getWidth(), 200);
+}
+
+template <>
+void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 10);
+  EXPECT_EQ(output.getWidth(), 20);
+}
+
+template <DeviceType DType>
+void Function(const BufferArgs& arguments) {
+  auto input = arguments[0].matrix<DType>();
+  auto output = arguments[1].matrix<DType>();
+  FunctionApi<DType>(output, input);
+}
+
+TEST(Function, BufferArgs) {
+  CpuMatrix cpuInput = CpuMatrix(100, 200);
+  CpuMatrix cpuOutput = CpuMatrix(100, 200);
+  BufferArgs cpuArgments;
+  cpuArgments.addArg(cpuInput);
+  cpuArgments.addArg(cpuOutput);
+  Function<DEVICE_TYPE_CPU>(cpuArgments);
+
+  GpuMatrix gpuInput = GpuMatrix(10, 20);
+  GpuMatrix gpuOutput = GpuMatrix(10, 20);
+  BufferArgs gpuArgments;
+  gpuArgments.addArg(gpuInput);
+  gpuArgments.addArg(gpuOutput);
+  Function<DEVICE_TYPE_GPU>(gpuArgments);
+}
+
+}  // namespace paddle

From d35ef9de10b3b97f63fa0156a8c7d36e7e89c8b8 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Mon, 9 Jan 2017 11:47:18 +0800
Subject: [PATCH 08/11] follow commit

---
 paddle/function/BufferArg.h   | 20 +++++++++++---------
 paddle/function/TensorShape.h |  4 ++--
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 52494afed3b85..d787d2814d88b 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -56,7 +56,7 @@ class BufferArg {
       : buf_(buf), valueType_(valueType) {}
 
   BufferArg(const Matrix& matrix)
-      : buf_((void*)matrix.getData()),
+      : buf_(reinterpret_cast<void*>(matrix.getData())),
         valueType_(DataType<real>::value),
         shape_(2) {
     shape_.setDim(0, matrix.getHeight());
@@ -64,21 +64,23 @@ class BufferArg {
   }
 
   BufferArg(const Matrix& matrix, const TensorShape& shape)
-      : buf_((void*)matrix.getData()),
+      : buf_(reinterpret_cast<void*>(matrix.getData())),
         valueType_(DataType<real>::value),
         shape_(shape) {
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
 
   BufferArg(const Vector& vector)
-      : buf_((void*)vector.getData()),
+      : buf_(reinterpret_cast<void*>(vector.getData())),
         valueType_(DataType<real>::value),
         shape_(1) {
     shape_.setDim(0, vector.getSize());
   }
 
   BufferArg(const IVector& vector)
-      : buf_((void*)vector.getData()), valueType_(VALUE_TYPE_INT32), shape_(1) {
+      : buf_(reinterpret_cast<void*>(vector.getData())),
+        valueType_(VALUE_TYPE_INT32),
+        shape_(1) {
     shape_.setDim(0, vector.getSize());
   }
 
@@ -129,7 +131,7 @@ class BufferArg {
 // sequence start positions in a mini-batch of sequences
 // shape_.ndims() == 1
 // valueType_ = int32
-// if a < b than value_.buf_[a] < value_.buf_[b]
+// if a < b then value_.buf_[a] < value_.buf_[b]
 class SequenceIdArg : public BufferArg {
 public:
   SequenceIdArg(void* buf, const TensorShape& shape)
@@ -203,13 +205,13 @@ class SparseMatrixArg : public BufferArg {
 
   SparseMatrixArg(const CpuSparseMatrix& sparse)
       : BufferArg(sparse),
-        row_((void*)sparse.getRows(), VALUE_TYPE_INT32),
-        col_((void*)sparse.getCols(), VALUE_TYPE_INT32) {}
+        row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+        col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
 
   SparseMatrixArg(const GpuSparseMatrix& sparse)
       : BufferArg(sparse),
-        row_((void*)sparse.getRows(), VALUE_TYPE_INT32),
-        col_((void*)sparse.getCols(), VALUE_TYPE_INT32) {}
+        row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+        col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
 
   ~SparseMatrixArg() {}
 
diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h
index e70484a1afd99..0333fe18316ba 100644
--- a/paddle/function/TensorShape.h
+++ b/paddle/function/TensorShape.h
@@ -30,14 +30,14 @@ class TensorShape {
   TensorShape(std::initializer_list<size_t> dims) {
     ndims_ = dims.size();
     initDims(ndims_);
-    std::copy(dims.begin(), dims.end(), dims_.begin());
+    dims_.assign(dims);
     numElements();
   };
 
   TensorShape(const TensorShape& t)
       : ndims_(t.ndims_), nelements_(t.nelements_) {
     initDims(ndims_);
-    std::copy(t.dims_.begin(), t.dims_.end(), dims_.begin());
+    dims_.assign(t.dims_.begin(), t.dims_.end());
   };
 
   // get the size of specified dimension

From 57e252119eee99523a92ecd323532bec355f9144 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Mon, 9 Jan 2017 15:21:47 +0800
Subject: [PATCH 09/11] BufferArg add ArgType and Function remove inouts

---
 paddle/function/BufferArg.h         |   45 +-
 paddle/function/Function.h          |   28 +-
 paddle/function/FunctionTest.cpp    |    2 +-
 paddle/math/Matrix.h~RFbb8b484f.TMP | 1870 ---------------------------
 4 files changed, 59 insertions(+), 1886 deletions(-)
 delete mode 100644 paddle/math/Matrix.h~RFbb8b484f.TMP

diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index d787d2814d88b..3d28249f69c2b 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -38,16 +38,40 @@ enum SparseDataType {
 
 enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
 
-/**
- * BufferArg used as the argument type for Function.
- */
 class BufferArg;
 class SequenceArg;
 class SparseMatrixArg;
 typedef std::shared_ptr<BufferArg> BufferArgPtr;
 
-// an array of arbitrary dimensions
+/**
+ * \brief BufferArg used as the argument type of Function.
+ *
+ * The arguments of the Paddle Function have four Buffer types.
+ * 1. BufferArg for a dense Buffer of any dimension.
+ * 2. SequenceIdArg for a Buffer of sequence start positions.
+ * 3. SequenceArg for a Buffer of sequence data.
+ * 4. SparseMatrixArg for a Buffer of sparse matrix.
+ *
+ * There is an ArgType property for the BufferArg used as Function Output.
+ * Whether the result of the Function calculation is assigned to the
+ * output Buffer or added to the output Buffer is determined by the
+ * argType_ property of the output BufferArg.
+ */
 class BufferArg {
+public:
+  // ArgType is only used by output BufferArg.
+  // For input argument, argType_ is ignored.
+  // For output argument, need to set the argType_ of the BufferArg.
+  enum ArgType {
+    UNSPECIFIED = 0,
+    ASSIGN_TO = 1,
+    ADD_TO = 2,
+  };
+
+  void setArgType(ArgType argType) { argType_ = argType; }
+
+  ArgType getArgType() const { return argType_; }
+
 public:
   BufferArg(void* buf, ValueType valueType, const TensorShape& shape)
       : buf_(buf), valueType_(valueType), shape_(shape) {}
@@ -56,7 +80,8 @@ class BufferArg {
       : buf_(buf), valueType_(valueType) {}
 
   BufferArg(const Matrix& matrix)
-      : buf_(reinterpret_cast<void*>(matrix.getData())),
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(2) {
     shape_.setDim(0, matrix.getHeight());
@@ -64,21 +89,24 @@ class BufferArg {
   }
 
   BufferArg(const Matrix& matrix, const TensorShape& shape)
-      : buf_(reinterpret_cast<void*>(matrix.getData())),
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
         shape_(shape) {
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
 
   BufferArg(const Vector& vector)
-      : buf_(reinterpret_cast<void*>(vector.getData())),
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
         valueType_(DataType<real>::value),
         shape_(1) {
     shape_.setDim(0, vector.getSize());
   }
 
   BufferArg(const IVector& vector)
-      : buf_(reinterpret_cast<void*>(vector.getData())),
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
         valueType_(VALUE_TYPE_INT32),
         shape_(1) {
     shape_.setDim(0, vector.getSize());
@@ -124,6 +152,7 @@ class BufferArg {
   ValueType valueType_;
   TensorShape shape_;
   BufferType bufferType_;
+  ArgType argType_ = UNSPECIFIED;
   // leading dimensions. The size is dims_.size()
   // Dims lds_;
 };
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 27ebe808aaf44..88d6824aa3939 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -56,12 +56,18 @@ class BufferArgs {
   BufferArgs() {}
   size_t size() const { return args_.size(); }
 
-  // add argument into BufferArgss
+  // add argument into BufferArgs
+  // Tensor can be Matrix, Vector, IVector.
   template <typename Tensor>
   void addArg(const Tensor& arg) {
     args_.push_back(std::make_shared<BufferArg>(arg));
   }
 
+  // Add arg into BufferArgs and reshape the arg.
+  //
+  // For example, arg represents an image buffer,
+  // but Matrix can only represent a two-dimensional Tensor.
+  // So an extra argument is needed to describe the shape of the image buffer.
   void addArg(const Matrix& arg, const TensorShape& shape);
 
   void addArg(const CpuSparseMatrix& arg);
@@ -78,10 +84,20 @@ class BufferArgs {
 };
 
 /**
- * Base class for Function.
+ * \brief Base class for Function.
  * The basic Function implementation requires override init and calc interfaces.
- * Need to pay attention to the inouts argument. For the input argument
- * that will be modified, it needs to be passed through inouts.
+ *
+ * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
+ * and ADD_TO.
+ * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
+ * result of Function is assigned to the output BufferArg.
+ * If output.getArgType() == ADD_TO, this is add mode, and the calculation
+ * result of Function is added to the output BufferArg.
+ *
+ * For example:
+ * ASSIGN_TO: output = Function(inputs)
+ * ADD_TO: output += Function(inputs)
+ * If Function has more than one output, each output can have different modes.
  */
 class FunctionBase {
 public:
@@ -89,9 +105,7 @@ class FunctionBase {
 
   virtual void init(const FuncConfig& config) {}
 
-  virtual void calc(const BufferArgs& inputs,
-                    const BufferArgs& outputs,
-                    const BufferArgs& inouts) {}
+  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
 
   static ClassRegistrar<FunctionBase> funcRegistrar_;
 };
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
index 7c3d6684cded1..7ce908320a6f6 100644
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -35,7 +35,7 @@ void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
 
 template <DeviceType DType>
 void Function(const BufferArgs& arguments) {
-  auto input = arguments[0].matrix<DType>();
+  const auto input = arguments[0].matrix<DType>();
   auto output = arguments[1].matrix<DType>();
   FunctionApi<DType>(output, input);
 }
diff --git a/paddle/math/Matrix.h~RFbb8b484f.TMP b/paddle/math/Matrix.h~RFbb8b484f.TMP
deleted file mode 100644
index d89b0f67b3c98..0000000000000
--- a/paddle/math/Matrix.h~RFbb8b484f.TMP
+++ /dev/null
@@ -1,1870 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <memory>
-#include <thread>
-
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/ThreadLocal.h"
-
-#include <hl_gpu.h>
-
-#include "BaseMatrix.h"
-#include "MemoryHandle.h"
-#include "Vector.h"
-#include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/common.h"
-
-namespace paddle {
-
-enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
-
-/**
- * @brief  matrix sparse_format .
- *
- * nnz represents nonzero number in sparse matrix.
- *
- * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element
- * represents row start index in Matrix. length of col and value are nnz.
- *
- * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element
- * represents col start index in Matrix. length of col and value are nnz.
- *
- * @code
- * for example: [0, 1, 0, 2, 0;
- *               1, 0, 0, 0, 0;
- *               0, 0, 0, 2, 5];
- * SPARSE_CSR row   [0, 2, 3, 5];
- *            col   [1, 3, 0, 3, 4];
- *            value [1, 2, 1, 2, 5]
- * SPARSE_CSC col   [0, 1, 2, 2, 4, 5];
- *            row   [1, 0, 0, 2, 2];
- *            value [1, 1, 2, 2, 5]
- * @endcode
- */
-enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 };
-
-class Matrix;
-class GpuMatrix;
-class CpuMatrix;
-class CpuSparseMatrix;
-class GpuSparseMatrix;
-typedef std::shared_ptr<Matrix> MatrixPtr;
-typedef std::shared_ptr<GpuMatrix> GpuMatrixPtr;
-typedef std::shared_ptr<CpuMatrix> CpuMatrixPtr;
-typedef std::shared_ptr<GpuSparseMatrix> GpuSparseMatrixPtr;
-typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr;
-
-/**
- * Copy or assignemnt constructor will share the data as opposed to making a
- * copy of the original data. To make a copy of the orinal data, use copyFrom()
- * instead.
- */
-class Matrix : public BaseMatrix {
-protected:
-  Matrix(MemoryHandlePtr memHandle,
-         size_t height,
-         size_t width,
-         bool trans,
-         bool use_gpu);
-
-  Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu);
-
-  Matrix(real* data,
-         size_t height,
-         size_t width,
-         size_t stride,
-         bool trans,
-         bool use_gpu);
-
-  static ThreadLocal<MatrixPtr> tmpMat_;
-
-public:
-  size_t elementCnt_;  // maximal number of elements which can be held in data_
-  MemoryHandlePtr memoryHandle_;
-
-public:
-  virtual ~Matrix() {}
-
-  static MatrixPtr create(MemoryHandlePtr memHandle,
-                          size_t height,
-                          size_t width,
-                          bool trans = false);
-  static MatrixPtr create(size_t height,
-                          size_t width,
-                          bool trans = false,
-                          bool useGpu = false);
-  static MatrixPtr create(real* data,
-                          size_t height,
-                          size_t width,
-                          bool trans = false,
-                          bool useGpu = false);
-  static MatrixPtr create(real* data,
-                          size_t height,
-                          size_t width,
-                          size_t stride,
-                          bool trans = false,
-                          bool useGpu = false);
-
-  static MatrixPtr createSparseMatrix(size_t height,
-                                      size_t width,
-                                      size_t nnz,
-                                      SparseValueType valueType = FLOAT_VALUE,
-                                      bool trans = false,
-                                      bool useGpu = false);
-  static MatrixPtr createSparseMatrix(size_t height,
-                                      size_t width,
-                                      size_t nnz,
-                                      SparseValueType valueType = FLOAT_VALUE,
-                                      SparseFormat foramt = SPARSE_CSR,
-                                      bool trans = false,
-                                      bool useGpu = false);
-
-  static MatrixPtr createSparseMatrix(real* data,
-                                      int* row,
-                                      int* col,
-                                      size_t height,
-                                      size_t width,
-                                      size_t nnz, /* used to allocate space */
-                                      SparseValueType valueType, /*value type*/
-                                      SparseFormat format,
-                                      bool trans,
-                                      bool useGpu);
-
-  static void resizeOrCreateSparseMatrix(
-      MatrixPtr& matrix,
-      size_t height,
-      size_t width,
-      size_t nnz,
-      SparseValueType valueType = FLOAT_VALUE,
-      SparseFormat foramt = SPARSE_CSR,
-      bool trans = false,
-      bool useGpu = false);
-
-  static void resizeOrCreate(MatrixPtr& a,
-                             size_t height,
-                             size_t width,
-                             bool trans = false,
-                             bool useGpu = false);
-
-  /**
-   * @brief  set the data buffer used to hold the matrix data.
-   *
-   * caller should make sure that the size of data is at least
-   * sizeof(real)*height*width.
-   */
-  void setData(real* data) {
-    BaseMatrix::setData(data);
-    memoryHandle_.reset();
-  }
-
-  /// the data should be contiguous
-  void setData(real* data, size_t newHeight, size_t newWidth) {
-    setData(data);
-    height_ = newHeight;
-    width_ = newWidth;
-    elementCnt_ = newHeight * newWidth;
-    stride_ = width_;
-  }
-
-  size_t getWidth() const { return width_; }
-  size_t getHeight() const { return height_; }
-  size_t getStride() const { return stride_; }
-  size_t getElementCnt() const { return elementCnt_; }
-  virtual real* getData() { return data_; }
-  virtual const real* getData() const { return data_; }
-  bool isTransposed() const { return trans_; }
-  bool isContiguous() const { return stride_ == width_ || height_ == 1; }
-
-  // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix
-  // befor call the following functions.
-  // Declare these functions in the base class just easy to call them.
-  // And these declarations should be moved to base class of sparse matrix
-  // if refactor sparse matrix
-  virtual int* getRows() const {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;  //! suppress warning for no return value.
-  }
-
-  virtual int* getCols() const {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;  //! suppress warning for no return value.
-  }
-
-  virtual SparseFormat getFormat() const {
-    LOG(FATAL) << "Not implemented";
-    return SPARSE_CSR;  //! suppress warning for no return value.
-  }
-
-  virtual SparseValueType getValueType() const {
-    LOG(FATAL) << "Not implemented";
-    return NO_VALUE;  //! suppress warning for no return value.
-  }
-
-  /**
-   * @brief matrix elment-wise add
-   *
-   * Named add3 just because add/add2 has been used in BaseMatrix.cu
-   * and they are not virtual function.
-   */
-  virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; }
-
-  MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; }
-
-  virtual void zeroMem() { LOG(FATAL) << "Not implemented"; }
-
-  virtual void resetOne() { LOG(FATAL) << "Not implemented"; }
-
-  void setDiag(real value);
-
-  virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void trimFrom(const CpuSparseMatrix& src) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  // asynchronous copy
-  virtual void copyFrom(const Matrix& src, hl_stream_t stream) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  MatrixPtr subMatrix(size_t startRow,
-                      size_t endRow,
-                      size_t startCol,
-                      size_t endCol);
-
-  MatrixPtr subRowMatrix(size_t startRow, size_t endRow) {
-    return subMatrix(startRow, endRow, 0, getWidth());
-  }
-
-  MatrixPtr subColMatrix(size_t startCol, size_t endCol) {
-    return subMatrix(0, getHeight(), startCol, endCol);
-  }
-
-  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) {
-    CHECK_LE(startRow + numRows, getHeight());
-    return Matrix::create(getData() + startRow * getWidth(),
-                          numRows,
-                          getWidth(),
-                          trans_,
-                          useGpu_);
-  }
-  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) {
-    CHECK_LE(startRow + numRows, getHeight());
-    CHECK_EQ(useGpu_, dest->useGpu_);
-    dest->setData(this->rowBuf(startRow), numRows, getWidth());
-    return dest;
-  }
-
-  /**
-   * If this is GpuMatrix, src is assumed to be CPU memory
-   *
-   * If this is CpuMatrix, src is assumed to be CPU memory
-   */
-  virtual void copyFrom(const real* src, size_t size) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void copyFrom(const real* src, const int64_t* seq) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief convert a int vector to a real matrix.
-   *
-   * (1) source and dest are both in CPU.
-   *
-   * (2) sizes are exactly match.
-   */
-  virtual void copyFrom(const IVector& src) {
-    LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
-  }
-
-  virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix,
-   *        NonValueSparseMatrix, etc.) as this.
-   *
-   * If height and width is zero, the new matrix will have the same size
-   * as this, otherwise the new matrix will have the specified size.
-   *
-   */
-  virtual MatrixPtr clone(size_t height = 0,
-                          size_t width = 0,
-                          bool useGpu = false) {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  virtual real* getRowBuf(size_t row) {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  virtual real getElement(size_t x, size_t y) const {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual real getSum() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual void accumulateColSum(Matrix& src) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual real getAbsSum() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  /**
-   * @note Original data may not be preserved after resize().
-   */
-  virtual void resize(size_t newHeight, size_t newWidth) = 0;
-
-  /**
-   * @note This should only be used for sparse matrix.
-   */
-  virtual void resize(size_t newHeight,
-                      size_t newWidth,
-                      size_t newNnz, /* total item used to allocate space */
-                      SparseValueType valueType,
-                      SparseFormat format) = 0;
-
-  /**
-   * @brief This should only be used for sparse matrix.
-   *
-   * Currently must be called for each row in order.
-   * The matrix is not valid until setRow is called for the last row.
-   */
-  virtual void setRow(size_t row,
-                      size_t colNum,
-                      const unsigned int* cols,
-                      const real* values) = 0;
-
-  virtual MatrixPtr getTranspose() = 0;
-
-  /**
-   * @brief  hard transpose.
-   *
-   * allocate matTrans' memory outside, then set memAlloc as false;
-   * else set as true.
-   */
-  virtual void transpose(MatrixPtr matTrans, bool memAlloc) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual MatrixPtr getInverse() {
-    LOG(FATAL) << "Not implemented";
-    return nullptr;
-  }
-
-  /**
-   * @brief  inverse.
-   *
-   * if allocate matInv's memory outside, then set memAlloc as false;
-   * else set as true.
-   */
-  virtual void inverse(MatrixPtr matInv, bool memAlloc) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-public:
-  /// Only set all variables to 0 or NULL but not free them.
-  virtual void clear() {
-    height_ = 0;
-    width_ = 0;
-    data_ = NULL;
-  }
-
-  void reshape(size_t height, size_t width);
-
-  /// add b to each sample of this.
-  virtual void addBias(Matrix& b, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void addSharedBias(Matrix& b, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  void addBias(Matrix& b, real scale, bool sharedBias) {
-    if (!sharedBias) {
-      addBias(b, scale);
-    } else {
-      addSharedBias(b, scale);
-    }
-  }
-
-  /// add each sample from a to this.
-  virtual void collectBias(Matrix& a, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void collectSharedBias(Matrix& a, real scale) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  void collectBias(Matrix& a, real scale, bool sharedBias) {
-    if (!sharedBias) {
-      collectBias(a, scale);
-    } else {
-      collectSharedBias(a, scale);
-    }
-  }
-
-  virtual void sequenceAvgForward(Matrix& a,
-                                  const IVector& startsPos,
-                                  int mode) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = scaleAB*(a*b) + scaleT*this
-   * @endcode
-   */
-  virtual void mul(const Matrix& a,
-                   const Matrix& b,
-                   real scaleAB,
-                   real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// Add a vector (column) b to matrix a, column by column.
-  virtual void addColumnVector(const Matrix& b) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   this(i, j) += vec(index(i, j), 0)
-   * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1
-   * @endcode
-   */
-  virtual void addByBitCode(size_t numClasses,
-                            const IVector& codes,
-                            const Matrix& vec) {
-    (void)numClasses;
-    (void)codes;
-    (void)vec;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   vec(index(i, j), 0) += this(i, j)
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void addByBitCodeBackward(size_t numClasses,
-                                    const IVector& codes,
-                                    Matrix& vec) {
-    (void)numClasses;
-    (void)codes;
-    (void)vec;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   this(i, j) += <mat.row(index(i, j)), input.row(i)>
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCode(size_t numClasses,
-                            const IVector& codes,
-                            const Matrix& mat,
-                            const Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   mat.row(index(i, j)) += this(i, j) * input.row(i)
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCodeBackwardWeight(size_t numClasses,
-                                          const IVector& codes,
-                                          Matrix& mat,
-                                          const Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength:
-   *   input.row(i) += this(i, j) * mat.row(index(i, j))
-   * where index is same as the index for addByBitCode
-   * @endcode
-   */
-  virtual void mulByBitCodeBackwardError(size_t numClasses,
-                                         const IVector& codes,
-                                         const Matrix& mat,
-                                         Matrix& input) {
-    (void)numClasses;
-    (void)codes;
-    (void)mat;
-    (void)input;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength
-   *   sum(i, 0) = scaleSum * \sum_j  bit(i, j) * this(i, j)
-   * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0
-   * @endcode
-   */
-  virtual void sumByBitCode(size_t numClasses,
-                            IVector& codes,
-                            Matrix& sum,
-                            real scaleSum) {
-    (void)numClasses;
-    (void)codes;
-    (void)sum;
-    (void)scaleSum;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * For j < codeLength
-   *  this(i, j) -= bit(i, j)
-   * where bit(i, j) is same as that for sumByBitCode
-   * @endcode
-   */
-  virtual void subByBitCode(size_t numClasses_, IVector& codes) {
-    (void)numClasses_;
-    (void)codes;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * add the sum of each row of this to mat
-   */
-  virtual void rowSum(Matrix& sum) {
-    (void)sum;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * set the max of each row of this to mat
-   */
-  virtual void rowMax(Matrix& max) {
-    (void)max;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * set the max of each column of this to mat
-   */
-  virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; }
-
-  /**
-   * @brief Get the top k elements of each column of this matrix.
-   *
-   * The row ids and values of these elements are stored in
-   * maxIds and max respectively. where k is the size of maxIds.
-   * And note that the top k elements are not sorted.
-   */
-  virtual void colMax(IVector& maxIds, Matrix& maxVal) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void maxoutForward(Matrix& a,
-                             IVector& id,
-                             size_t channels,
-                             size_t groups) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void maxoutBackward(Matrix& a,
-                              IVector& id,
-                              size_t channels,
-                              size_t groups) {
-    LOG(FATAL) << "not implemented";
-  }
-
-  virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @brief Get the top k elements of each row of this matrix.
-   *
-   * The column ids and values of these elements are stored in
-   * maxIds and max respectively. where k is the size of maxIds.
-   * And note that the top k elements are not sorted.
-   */
-  virtual void rowMax(IVector& maxIds, Matrix& max) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// normalize each row so that the sum of each row is 1.
-  virtual void rowNormalizeL1(Matrix& out) {
-    (void)out;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   *  this = a*b
-   * @endcode
-   */
-  virtual void mul(const Matrix& a, const Matrix& b) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = scaleAB*(this*b) +  scaleT*this
-   * @endcode
-   */
-  virtual void rightMul(Matrix& b, real scaleAB, real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = this* b
-   * @endcode
-   */
-  virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @code
-   * this = scaleAB*(a*this) +  scaleT*this
-   * @endcode
-   */
-  virtual void leftMul(Matrix& a, real scaleAB, real scaleT) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this = a*this)
-   * @endcode
-   */
-  virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; }
-
-  /// merge the element for each col.
-  virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; }
-
-  /// copy -log(output[label]) to this->data[i].
-  virtual void oneHotCrossEntropy(Matrix& output, IVector& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the error of outputV according to label.
-  virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// copy -log(output[label]) to this->data[i].
-  virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                              IVector& label,
-                                              real alpha) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the error of outputV according to label.
-  virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                                IVector& label,
-                                                real alpha) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * \f[
-   *  a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j}
-   * \f]
-   *
-   * b contains M elements,
-   * c contains N elements (N is odd),
-   * b's index arithmetic is computed modulo M,
-   * c's index arithmetic is computed modulo N.
-   */
-  virtual void circularConv(Matrix& b, Matrix& c) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void circularConvDerivative(Matrix& output,
-                                      Matrix& prevOut1,
-                                      Matrix& prevOut2,
-                                      Matrix& prevGrad1,
-                                      Matrix& prevGrad2) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */
-  virtual void softmax(Matrix& output) {
-    (void)output;
-    LOG(FATAL) << "Not implemeted";
-  }
-  virtual void sequenceSoftmax(Matrix& output, const IVector& index) {
-    (void)output;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void softmaxBackward(Matrix& outputV) {
-    (void)outputV;
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /*
-    sum_i = sum_j this_ij * output_ij
-    this_ij = output_ij* (this_ij - sum_i)
-  */
-  virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// calculate the sum of squares diff cost.
-  virtual void sumOfSquares(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// gradient of sumOfSquares.
-  virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void tanhDerivative(Matrix& output) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; }
-
-  virtual void softreluDerivative(Matrix& output) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void scaledTanh(Matrix& output, real p1, real p2) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * cosine similarity, for each row i,
-   *   this[i] = cos(output1[i], output2[i])
-   *
-   * output2 can only have one row, then for each row i,
-   *   this[i] = cos(output1[i], output2[0])
-   */
-  virtual void cosSim(Matrix& output1, Matrix& output2, real scale = 1.0f) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void cosSimDerivative(Matrix& output,
-                                Matrix& prevOut1,
-                                Matrix& prevOut2,
-                                Matrix& prevGrad1,
-                                Matrix& prevGrad2,
-                                real scale = 1.0f) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// print out the values of elements to os
-  virtual void print(std::ostream& os) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * print a part of the matrix
-   * from the (top,left) value to the (height, width) value (not included)
-   */
-  virtual void print(std::ostream& os, size_t height, size_t width) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /// print one row to os
-  virtual void printOneRow(std::ostream& os, size_t idx) const {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {}
-
-  virtual real getMin() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-  virtual real getMax() {
-    LOG(FATAL) << "Not implemented";
-    return 0;
-  }
-
-  virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; }
-
-  /**
-   * @brief  calulate the error of classification
-   *
-   * output[i] = 1 if row i is an error.
-   *
-   * output[i] = 0 if row i is correct.
-   */
-  virtual void classificationError(Matrix& output, IVector& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * This function is used to calculate the convolution:
-   *
-   * It will expand a feature matrix according to the
-   * convolution filters
-   */
-  virtual void convExpand(Matrix& feature,
-                          int feaImgHeight,
-                          int feaImgWidth,
-                          int channels,
-                          int blockH,
-                          int blockW,
-                          int strideH,
-                          int strideW,
-                          int paddingH,
-                          int paddingW,
-                          int outputH,
-                          int outputW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * This function is the reverse implementation of convExpand:
-   *
-   * Its function is to restore a expanded-matrix into a feature matrix
-   */
-  virtual void convShrink(Matrix& expandColMat,
-                          int thisImgHeight,
-                          int thisImgWidth,
-                          int channels,
-                          int blockH,
-                          int blockW,
-                          int strideH,
-                          int strideW,
-                          int paddingH,
-                          int paddingW,
-                          int outputH,
-                          int outputW,
-                          real alpha = 1.0f,
-                          real beta = 0.0f) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * Pooling forward operation, pick out the largest element
-   * in the sizeX of value
-   */
-  virtual void maxPoolForward(Matrix& inputMat,
-                              size_t imgSizeH,
-                              size_t imgSizeW,
-                              size_t channels,
-                              size_t sizeX,
-                              size_t sizeY,
-                              size_t strideH,
-                              size_t strideW,
-                              size_t outputH,
-                              size_t outputW,
-                              size_t paddingH,
-                              size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /// Pooling backward operation.
-  virtual void maxPoolBackward(Matrix& image,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               Matrix& outGrad,
-                               Matrix& outV,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               real scaleTargets,
-                               real scaleOutput,
-                               size_t paddingH,
-                               size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /// Pooling forward operation, caculate the average of sizeX elements.
-  virtual void avgPoolForward(Matrix& input,
-                              size_t imgSizeH,
-                              size_t imgSizeW,
-                              size_t channels,
-                              size_t sizeX,
-                              size_t sizeY,
-                              size_t strideH,
-                              size_t strideW,
-                              size_t outputH,
-                              size_t outputW,
-                              size_t paddingH,
-                              size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void avgPoolBackward(Matrix& input,
-                               size_t imgSizeH,
-                               size_t imgSizeW,
-                               size_t sizeX,
-                               size_t sizeY,
-                               size_t strideH,
-                               size_t strideW,
-                               size_t outputH,
-                               size_t outputW,
-                               real scaleTargets,
-                               real scaleOutput,
-                               size_t paddingH,
-                               size_t paddingW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * Input: one or more sequences. Each sequence contains some instances.
-   *
-   * Output: output size is the number of input sequences (NOT input
-   * instances).
-   *
-   * output[i] is set to max_input[i].
-   */
-  virtual void maxSequenceForward(Matrix& input,
-                                  const IVector& sequence,
-                                  IVector& index) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void maxSequenceBackward(Matrix& outputGrad,
-                                   const IVector& sequence,
-                                   IVector& index) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * if ids[i] == -1, it will be ignored
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids) {
-    (void)table;
-    (void)ids;
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * if ids[i] == -1, it will be ignored
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids) {
-    (void)table;
-    (void)ids;
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @code
-   * table[i, id[i]] += this[i]
-   * @endcode
-   */
-  virtual void addElements(Matrix& table, IVector& ids) {
-    LOG(FATAL) << "Not implemented";
-  }
-  /**
-   * @brief  cross entropy for multi binary labels
-   *
-   * @code
-   * this[i] = -sum(label[i][j]*log(output[i][j])
-   *           + (1-label[i][j])*log(1-output[i][j]))
-   * @endcode
-   */
-  virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief  The gradient of cross entropy for multi binary labels on output
-   *
-   * @code
-   * this[i][j] = -label[i][j]/output[i][j]
-   *              + (1-label[i][j])/(1-output[i][j])
-   * @endcode
-   */
-  virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /**
-   * @brief  Calculate the classification error for multi binary labels
-   *
-   * @code
-   * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0)
-   *            || (output[i][j] < threshold && label[i][j] == 1))
-   *            / output->getWidth()
-   * @endcode
-   */
-  virtual void classificationErrorMulti(Matrix& output,
-                                        Matrix& label,
-                                        real threshold) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void paramReluForward(Matrix& data, Matrix& W) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void bilinearForward(const Matrix& in,
-                               const size_t inImgH,
-                               const size_t inImgW,
-                               const size_t outImgH,
-                               const size_t outImgW,
-                               const size_t numChannels,
-                               const real ratioH,
-                               const real ratioW) {
-    LOG(FATAL) << "Not implemented";
-  }
-  virtual void bilinearBackward(const Matrix& out,
-                                const size_t outImgH,
-                                const size_t outImgW,
-                                const size_t inImgH,
-                                const size_t inImgW,
-                                const size_t numChannels,
-                                const real ratioH,
-                                const real ratioW) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    if (useGpu_) {
-      TensorGpuApply<real>(*this, expr);
-    } else {
-      TensorCpuApply<real>(*this, expr);
-    }
-  }
-
-  bool isEmpty() const {
-    return data_ == nullptr;
-  }
-
-  explicit operator bool() const {
-    return !isEmpty();
-  }
-};
-
-inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
-  mat.print(os);
-  return os;
-}
-
-class GpuMatrix : public Matrix {
-public:
-  GpuMatrix();
-
-  GpuMatrix(size_t height, size_t width, bool trans = false);
-  GpuMatrix(real* data, size_t height, size_t width, bool trans = false)
-      : Matrix(data, height, width, trans, true) {}
-  GpuMatrix(real* data,
-            size_t height,
-            size_t width,
-            size_t stride,
-            bool trans = false)
-      : Matrix(data, height, width, stride, trans, true) {}
-  GpuMatrix(GpuMemHandlePtr dataHandle,
-            size_t height,
-            size_t width,
-            bool trans = false)
-      : Matrix(dataHandle, height, width, trans, true) {}
-  ~GpuMatrix();
-
-  void zeroMem();
-  void resetOne();
-  void setDiag(real value);
-
-  void resize(size_t newHeight, size_t newWidth);
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-
-  /**
-   * Copy the data from cpu_memory buffer
-   */
-  void copyFrom(const real* hostSrc, size_t size);
-
-  void copyFrom(const real* hostSrc, const int64_t* seq);
-
-  void copyFrom(const Matrix& src, hl_stream_t stream);
-
-  void copyFrom(const Matrix& src);
-
-  void copyFrom(const IVector& src);
-
-  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
-
-  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
-
-  real getElement(size_t x, size_t y) const;
-
-  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-  real getSum();
-  void accumulateColSum(Matrix& src);
-  real getAbsSum();
-
-  MatrixPtr getTranspose();
-  void transpose(MatrixPtr matTrans, bool memAlloc);
-
-  MatrixPtr getInverse();
-  void inverse(MatrixPtr matInv, bool memAlloc);
-
-  /// add b to each sample of this.
-  void addBias(Matrix& b, real scale);
-  void addSharedBias(Matrix& b, real scale);
-
-  /**
-   * @code
-   * add each sample from a to this.
-   * @endcode
-   */
-  void collectBias(Matrix& a, real scale);
-  void collectSharedBias(Matrix& a, real scale);
-
-  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids);
-
-  void addColumnVector(const Matrix& b);
-
-  /**
-   * @code
-   * this = scaleAB*(a*b) + scaleT*this
-   * @endcode
-   */
-  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = a*b
-   * @endcode
-   */
-  void mul(const Matrix& a, const Matrix& b);
-
-  void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT);
-
-  void mul(const GpuSparseMatrix& a,
-           const GpuMatrix& b,
-           real scaleAB,
-           real scaleT);
-
-  void mul(const GpuMatrix& a,
-           const GpuSparseMatrix& b,
-           real scaleAB,
-           real scaleT);
-
-  /**
-   * @code
-   * this = scaleAB*(this*b) +  scaleT*this
-   * @endcode
-   */
-  void rightMul(Matrix& b, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = this* b
-   * @endcode
-   */
-  void rightMul(Matrix& b);
-
-  /**
-   * @code
-   * this = scaleAB*(a*this) +  scaleT*this
-   * @endcode
-   */
-  void leftMul(Matrix& a, real scaleAB, real scaleT);
-
-  /**
-   * @code
-   * this = a*this
-   * @endcode
-   */
-  void leftMul(Matrix& a);
-
-  void colMerge(Matrix& src);
-  void rowSum(Matrix& sum);
-  void rowMax(Matrix& max);
-  void rowMax(IVector& maxIds, Matrix& max);
-  void colMax(Matrix& max);
-  void colMax(IVector& maxIds, Matrix& max);
-  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
-
-  void oneHotCrossEntropy(Matrix& output, IVector& label);
-  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
-  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                      IVector& label,
-                                      real alpha);
-  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                        IVector& label,
-                                        real alpha);
-
-  void softmax(Matrix& output);
-  void sequenceSoftmax(Matrix& output, const IVector& index);
-  void softmaxBackward(Matrix& outputV);
-  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
-
-  /// calculate the sum of squares diff cost.
-  void sumOfSquares(Matrix& output, Matrix& label);
-
-  /// gradient of sumOfSquares.
-  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
-  void tanh(Matrix& output);
-  void tanhDerivative(Matrix& output);
-  void softrelu(Matrix& output);
-  void softreluDerivative(Matrix& output);
-  void scaledTanh(Matrix& output, real p1, real p2);
-
-  void cosSim(Matrix& output1, Matrix& output2, real scale);
-  void cosSimDerivative(Matrix& output,
-                        Matrix& prevOut1,
-                        Matrix& prevOut2,
-                        Matrix& prevGrad1,
-                        Matrix& prevGrad2,
-                        real scale);
-
-  virtual void print(std::ostream& os) const;
-  virtual void print(std::ostream& os, size_t height, size_t width) const;
-
-  void paramReluForward(Matrix& data, Matrix& W);
-  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
-  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
-
-  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
-  void randomizeUniform();
-
-  void classificationError(Matrix& output, IVector& label);
-
-  void convExpand(Matrix& feature,
-                  int feaImgHeight,
-                  int feaImgWidth,
-                  int channels,
-                  int blockH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW);
-
-  void convShrink(Matrix& expandColMat,
-                  int thisImgHeight,
-                  int thisImgWidth,
-                  int channels,
-                  int blockH,
-                  int blochW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingWreal,
-                  int outputH,
-                  int outputW,
-                  real alpha = 1.0f,
-                  real beta = 0.0f);
-
-  void maxPoolForward(Matrix& inputMat,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW);
-
-  void maxPoolBackward(Matrix& image,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       Matrix& outGrad,
-                       Matrix& outV,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW);
-
-  void avgPoolForward(Matrix& input,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW);
-
-  void avgPoolBackward(Matrix& input,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW);
-
-  void maxSequenceForward(Matrix& input,
-                          const IVector& sequence,
-                          IVector& index);
-
-  void maxSequenceBackward(Matrix& outputGrad,
-                           const IVector& sequence,
-                           IVector& index);
-
-  void bilinearForward(const Matrix& in,
-                       const size_t inImgH,
-                       const size_t inImgW,
-                       const size_t outImgH,
-                       const size_t outImgW,
-                       const size_t numChannels,
-                       const real ratioH,
-                       const real ratioW);
-
-  void bilinearBackward(const Matrix& out,
-                        const size_t outImgH,
-                        const size_t outImgW,
-                        const size_t inImgH,
-                        const size_t inImgW,
-                        const size_t numChannels,
-                        const real ratioH,
-                        const real ratioW);
-
-  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
-
-  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorGpuApply<real>(*this, expr);
-  }
-};
-
-class CpuMatrix : public Matrix {
-public:
-  CpuMatrix(size_t height, size_t width, bool trans = false);
-  CpuMatrix(real* data, size_t height, size_t width, bool trans = false)
-      : Matrix(data, height, width, trans, false) {}
-  CpuMatrix(real* data,
-            size_t height,
-            size_t width,
-            size_t stride,
-            bool trans = false)
-      : Matrix(data, height, width, stride, trans, false) {}
-
-  CpuMatrix(CpuMemHandlePtr dataHandle,
-            size_t height,
-            size_t width,
-            bool trans = false)
-      : Matrix(dataHandle, height, width, trans, false) {}
-
-  ~CpuMatrix();
-
-  void zeroMem();
-  void resetOne();
-  void setDiag(real value);
-
-  void resize(size_t newHeight, size_t newWidth);
-  void resize(size_t newHeight,
-              size_t newWidth,
-              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType,
-              SparseFormat format) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-  void setRow(size_t row,
-              size_t colNum,
-              const unsigned int* cols,
-              const real* values) {
-    LOG(FATAL) << "Only Support Sparse Matrix";
-  }
-
-  real getElement(size_t x, size_t y) const;
-  real getSum();
-  void accumulateColSum(Matrix& src);
-  real getAbsSum();
-
-  MatrixPtr getTranspose();
-  void transpose(MatrixPtr matTrans, bool memAlloc);
-
-  MatrixPtr getInverse();
-  void inverse(MatrixPtr matInv, bool memAlloc);
-
-  void copyFrom(const Matrix& src);
-
-  void copyFrom(const Matrix& src, hl_stream_t stream);
-
-  void copyFrom(const real* cpuSrc, size_t size);
-
-  void copyFrom(const real* cpuSrc, const int64_t* seq);
-
-  void copyFrom(const IVector& src);
-
-  void copyFrom(CpuSparseMatrix& src);
-
-  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
-
-  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
-
-  void convExpand(Matrix& feature,
-                  int feaImgHeight,
-                  int feaImgWidth,
-                  int channels,
-                  int blcokH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW);
-
-  void convShrink(Matrix& expandFeat,
-                  int thisImgHeight,
-                  int thisImgWidth,
-                  int channels,
-                  int blockH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW,
-                  real alpha = 1.0f,
-                  real beta = 0.0f);
-
-  void maxPoolForward(Matrix& inputMat,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW);
-
-  void maxPoolBackward(Matrix& image,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       Matrix& outGrad,
-                       Matrix& outV,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW);
-
-  void avgPoolForward(Matrix& input,
-                      size_t imgSizeH,
-                      size_t imgSizeW,
-                      size_t channels,
-                      size_t sizeX,
-                      size_t sizeY,
-                      size_t strideH,
-                      size_t strideW,
-                      size_t outputH,
-                      size_t outputW,
-                      size_t paddingH,
-                      size_t paddingW);
-
-  void avgPoolBackward(Matrix& input,
-                       size_t imgSizeH,
-                       size_t imgSizeW,
-                       size_t sizeX,
-                       size_t sizeY,
-                       size_t strideH,
-                       size_t strideW,
-                       size_t outputH,
-                       size_t outputW,
-                       real scaleTargets,
-                       real scaleOutput,
-                       size_t paddingH,
-                       size_t paddingW);
-
-  void maxSequenceForward(Matrix& input,
-                          const IVector& sequence,
-                          IVector& index);
-
-  void maxSequenceBackward(Matrix& outputGrad,
-                           const IVector& sequence,
-                           IVector& index);
-
-  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
-  virtual real* getRowBuf(size_t row) { return getRow(row); }
-
-public:
-  /// add b to each sample of this.
-  void addBias(Matrix& b, real scale);
-  void addSharedBias(Matrix& b, real scale);
-
-  /// add each sample of a to this.
-  void collectBias(Matrix& a, real scale);
-  void collectSharedBias(Matrix& a, real scale);
-
-  void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
-
-  /**
-   * @code
-   * this.row[i] += table.row[ids[i]]
-   * @endcode
-   */
-  virtual void selectRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table.row[ids[i]] += this.row[i]
-   * @endcode
-   */
-  virtual void addToRows(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * this[i] = table[i, id[i]]
-   * @endcode
-   */
-  virtual void selectElements(Matrix& table, IVector& ids);
-
-  /**
-   * @code
-   * table[i, id[i]] += this[i]
-   * @endcode
-   */
-  virtual void addElements(Matrix& table, IVector& ids);
-
-  /**
-   * use abstract getRow() to get row from table.
-   *
-   * Define table as template instead of virtual class for performance sake.
-   * internal used by above two virtual funcs.
-   */
-  template <typename TableMatType>
-  void selectRowsImp(TableMatType& table, IVector& ids);
-  template <typename TableMatType>
-  void addToRowsImp(TableMatType& table, IVector& ids);
-
-  void addColumnVector(const Matrix& b);
-
-  void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT);
-  void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-  void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT);
-
-  static void mul(CpuMatrix* a,
-                  CpuMatrix* b,
-                  CpuSparseMatrix* c,
-                  real scaleAB,
-                  real scaleT);
-
-  /**
-   * c = a * b
-   *
-   * use abstract getRow() to get row from B,C.
-   * Define B,C as template instead of virtual class for performance sake.
-   */
-  template <typename MatBType, typename MatCType>
-  static void mul(
-      CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT);
-
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-
-  void mul(const Matrix& a, const Matrix& b);
-
-  void rightMul(Matrix& b, real scaleAB, real scaleT);
-  void rightMul(Matrix& b);
-
-  void leftMul(Matrix& a, real scaleAB, real scaleT);
-  void leftMul(Matrix& a);
-  void colMerge(Matrix& src);
-  void rowSum(Matrix& sum);
-  void rowMaxId(IVector& maxIds);
-  void rowMax(Matrix& max);
-  void rowMax(IVector& maxIds, Matrix& maxVal);
-  void colMax(Matrix& max);
-  void colMax(IVector& maxIds, Matrix& maxVal);
-  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
-  void rowNormalizeL1(Matrix& out);
-
-  void oneHotCrossEntropy(Matrix& output, IVector& label);
-  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
-  void oneHotCrossEntropyWithSelfNorm(Matrix& output,
-                                      IVector& label,
-                                      real alpha);
-  void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV,
-                                        IVector& label,
-                                        real alpha);
-
-  void circularConv(Matrix& b, Matrix& c);
-  void circularConvDerivative(Matrix& output,
-                              Matrix& prevOut1,
-                              Matrix& prevOut2,
-                              Matrix& prevGrad1,
-                              Matrix& prevGrad2);
-
-  void softmax(Matrix& output);
-  void sequenceSoftmax(Matrix& output, const IVector& index);
-  void softmaxDerivative(Matrix& output, Matrix& sftmaxSum);
-
-  /// calculate the sum of squares diff cost.
-  void sumOfSquares(Matrix& output, Matrix& label);
-
-  /// gradient of sumOfSquares.
-  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
-
-  void tanh(Matrix& output);
-  void tanhDerivative(Matrix& output);
-
-  void softrelu(Matrix& output);
-  void softreluDerivative(Matrix& output);
-  void scaledTanh(Matrix& output, real p1, real p2);
-
-  void cosSim(Matrix& output1, Matrix& output2, real scale);
-  void cosSimDerivative(Matrix& output,
-                        Matrix& prevOut1,
-                        Matrix& prevOut2,
-                        Matrix& prevGrad1,
-                        Matrix& prevGrad2,
-                        real scale);
-
-  void print(std::ostream& os) const;
-  void print(std::ostream& os, size_t height, size_t width) const;
-  void printOneRow(std::ostream& os, size_t idx) const;
-
-  void paramReluForward(Matrix& data, Matrix& W);
-  void paramReluBackwardW(Matrix& oGrad, Matrix& data);
-  void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W);
-
-  void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
-
-  real getMin();
-  real getMax();
-
-  void randomizeUniform();
-
-  void classificationError(Matrix& output, IVector& label);
-
-  void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec);
-
-  void addByBitCodeBackward(size_t numClasses,
-                            const IVector& codes,
-                            Matrix& vec);
-
-  void mulByBitCode(size_t numClasses,
-                    const IVector& codes,
-                    const Matrix& mat,
-                    const Matrix& input);
-
-  void mulByBitCodeBackwardWeight(size_t numClasses,
-                                  const IVector& codes,
-                                  Matrix& mat,
-                                  const Matrix& input);
-
-  void mulByBitCodeBackwardError(size_t numClasses,
-                                 const IVector& codes,
-                                 const Matrix& mat,
-                                 Matrix& input);
-
-  void sumByBitCode(size_t numClasses,
-                    IVector& codes,
-                    Matrix& sum,
-                    real scaleSum);
-
-  void subByBitCode(size_t numClasses_, IVector& codes);
-
-  void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
-  void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
-  void classificationErrorMulti(Matrix& output, Matrix& label, real threshold);
-
-  void bilinearForward(const Matrix& in,
-                       const size_t inImgH,
-                       const size_t inImgW,
-                       const size_t outImgH,
-                       const size_t outImgW,
-                       const size_t numChannels,
-                       const real ratioH,
-                       const real ratioW);
-
-  void bilinearBackward(const Matrix& out,
-                        const size_t outImgH,
-                        const size_t outImgW,
-                        const size_t inImgH,
-                        const size_t inImgW,
-                        const size_t numChannels,
-                        const real ratioH,
-                        const real ratioW);
-
-  template <typename ExpressionType>
-  void operator=(const ExpressionType& expr) {
-    TensorCpuApply<real>(*this, expr);
-  }
-};
-
-class SharedCpuMatrix : public CpuMatrix {
-public:
-  /* blockNum is number of partitions of the matrix  */
-  SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
-      : CpuMatrix(height, width, trans) {
-    initShared(blockNum);
-  }
-  SharedCpuMatrix(
-      int blockNum, real* data, size_t height, size_t width, bool trans = false)
-      : CpuMatrix(data, height, width, trans) {
-    initShared(blockNum);
-  }
-
-  SharedCpuMatrix(int blockNum,
-                  CpuMemHandlePtr dataHandle,
-                  size_t height,
-                  size_t width,
-                  bool trans = false)
-      : CpuMatrix(dataHandle, height, width, trans) {
-    initShared(blockNum);
-  }
-
-  SharedCpuMatrix(CpuMemHandlePtr dataHandle,
-                  size_t height,
-                  size_t width,
-                  bool trans = false)
-      : CpuMatrix(dataHandle, height, width, trans) {
-    initBlock(1);
-  }
-
-  ~SharedCpuMatrix() {}
-
-public:
-  virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT);
-  virtual void add(Matrix& b, real p1, real p2);
-  virtual void add(real p1, real p2);
-
-private:
-  using Matrix::mul;
-  void initShared(int blockNum);
-  void initBlock(int blockNum);
-
-  int blockNum_;
-  std::vector<std::unique_ptr<std::mutex>> blockLocks_;
-  ThreadLocal<CpuMatrixPtr> localBuf_;
-  ThreadLocal<std::vector<int>> localBufRows_;
-  ThreadLocal<std::vector<int>> blockSeq_;
-};
-
-typedef struct { unsigned int col; } sparse_non_value_t;
-
-typedef struct {
-  unsigned int col;
-  float value;
-} sparse_float_value_t;
-
-}  // namespace paddle
-#include "ExecViaCpu.h"

From df9be2d483cc3073e7b8680c1f687654710d2865 Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Mon, 9 Jan 2017 22:57:29 +0800
Subject: [PATCH 10/11] fix CrossMapNormalFunc and ContextProjectionFunc(remove
 inouts argument)

---
 paddle/function/BufferArg.h                   | 78 +++++++++++--------
 paddle/function/ContextProjectionOp.cpp       | 13 ++--
 paddle/function/CrossMapNormalOp.cpp          | 19 +++--
 paddle/function/Function.cpp                  | 14 ++--
 paddle/function/Function.h                    | 16 ++--
 paddle/gserver/layers/ContextProjection.cpp   | 10 +--
 paddle/gserver/layers/NormProjectionLayer.cpp | 33 ++++----
 7 files changed, 98 insertions(+), 85 deletions(-)

diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 3d28249f69c2b..6576d18dae99e 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -57,58 +57,67 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
  * output Buffer or added to the output Buffer is determined by the
  * argType_ property of the output BufferArg.
  */
+
+// ArgType is only meaningful for an output BufferArg.
+// For an input argument, argType_ is ignored.
+// For an output argument, the argType_ of the BufferArg must be set.
+enum ArgType {
+  UNSPECIFIED = 0,
+  ASSIGN_TO = 1,
+  ADD_TO = 2,
+};
 class BufferArg {
 public:
-  // ArgType is only used by output BufferArg.
-  // For input argument, argType_ is ignored.
-  // For output argument, need to set the argType_ of the BufferArg.
-  enum ArgType {
-    UNSPECIFIED = 0,
-    ASSIGN_TO = 1,
-    ADD_TO = 2,
-  };
-
   void setArgType(ArgType argType) { argType_ = argType; }
 
   ArgType getArgType() const { return argType_; }
 
 public:
-  BufferArg(void* buf, ValueType valueType, const TensorShape& shape)
-      : buf_(buf), valueType_(valueType), shape_(shape) {}
+  BufferArg(void* buf,
+            ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {}
 
   BufferArg(void* buf, ValueType valueType)
       : buf_(buf), valueType_(valueType) {}
 
-  BufferArg(const Matrix& matrix)
+  BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
       : buf_(
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
-        shape_(2) {
+        shape_(2),
+        argType_(argType) {
     shape_.setDim(0, matrix.getHeight());
     shape_.setDim(1, matrix.getWidth());
   }
 
-  BufferArg(const Matrix& matrix, const TensorShape& shape)
+  BufferArg(const Matrix& matrix,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
       : buf_(
             const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
         valueType_(DataType<real>::value),
-        shape_(shape) {
+        shape_(shape),
+        argType_(argType) {
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
 
-  BufferArg(const Vector& vector)
+  BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED)
       : buf_(
             const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
         valueType_(DataType<real>::value),
-        shape_(1) {
+        shape_(1),
+        argType_(argType) {
     shape_.setDim(0, vector.getSize());
   }
 
-  BufferArg(const IVector& vector)
+  BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED)
       : buf_(
             const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
         valueType_(VALUE_TYPE_INT32),
-        shape_(1) {
+        shape_(1),
+        argType_(argType) {
     shape_.setDim(0, vector.getSize());
   }
 
@@ -163,8 +172,10 @@ class BufferArg {
 // if a < b then value_.buf_[a] < value_.buf_[b]
 class SequenceIdArg : public BufferArg {
 public:
-  SequenceIdArg(void* buf, const TensorShape& shape)
-      : BufferArg(buf, VALUE_TYPE_INT32, shape) {
+  SequenceIdArg(void* buf,
+                const TensorShape& shape,
+                ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
     CHECK_EQ(shape_.ndims(), 1);
     numSeqs_ = shape_[0] - 1;
   }
@@ -187,11 +198,15 @@ class SequenceArg : public BufferArg {
   SequenceArg(void* buf,
               ValueType valueType,
               const TensorShape& shape,
-              const SequenceIdArg& startPositions)
-      : BufferArg(buf, valueType, shape), startPositions_(startPositions) {}
+              const SequenceIdArg& startPositions,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        startPositions_(startPositions) {}
 
-  SequenceArg(const Matrix& matrix, const IVector& vector)
-      : BufferArg(matrix), startPositions_(vector) {}
+  SequenceArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(matrix, argType), startPositions_(vector) {}
 
   ~SequenceArg() {}
 
@@ -214,8 +229,9 @@ class SparseMatrixArg : public BufferArg {
                   const BufferArg& col,
                   size_t nnz,
                   SparseDataFormat format,
-                  SparseDataType type)
-      : BufferArg(buf, valueType, shape),
+                  SparseDataType type,
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
         row_(row),
         col_(col),
         nnz_(nnz),
@@ -232,13 +248,13 @@ class SparseMatrixArg : public BufferArg {
     }
   }
 
-  SparseMatrixArg(const CpuSparseMatrix& sparse)
-      : BufferArg(sparse),
+  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED)
+      : BufferArg(sparse, argType),
         row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
         col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
 
-  SparseMatrixArg(const GpuSparseMatrix& sparse)
-      : BufferArg(sparse),
+  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED)
+      : BufferArg(sparse, argType),
         row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
         col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
 
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index 1a483c47953b1..b50098c52123a 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -84,12 +84,9 @@ class ContextProjectionForwardFunc : public FunctionBase {
     begin_pad_ = config.get<size_t>("begin_pad");
   }
 
-  void calc(const BufferArgs& inputs,
-            const BufferArgs& outputs,
-            const BufferArgs& inouts) override {
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(3, inputs.size());
     CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
 
     CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data());
     CHECK_EQ(outputs[0].shape().ndims(), 2);
@@ -103,6 +100,7 @@ class ContextProjectionForwardFunc : public FunctionBase {
     /// input and output has the same batch_size
     CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
 
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
     auto out_mat = outputs[0].matrix<Device>();
     auto in_mat = inputs[0].matrix<Device>();
     auto w_mat = !inputs[1].data()
@@ -194,12 +192,9 @@ class ContextProjectionBackwardFunc : public FunctionBase {
     total_pad_ = config.get<size_t>("total_pad");
   }
 
-  void calc(const BufferArgs& inputs,
-            const BufferArgs& outputs,
-            const BufferArgs& inouts) override {
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(3, inputs.size());
     CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
 
     CHECK(outputs[0].data() && inputs[2].data());
     CHECK_EQ(outputs[0].shape().ndims(), 2);
@@ -214,6 +209,8 @@ class ContextProjectionBackwardFunc : public FunctionBase {
     /// dim of output = dim of input * context_length
     CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
 
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
     auto out_grad_mat = outputs[0].matrix<Device>();
     auto in_grad_mat =
         !inputs[0].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index ec27db9c21296..23ee357a53d0d 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -112,6 +112,8 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
 }
 
 /**
+ * \brief {o_0, o_1} = calc(i_0)
+ *
  * \param inputs[0] input value.
  * \param outputs[0] output value.
  * \param outputs[1] denoms.
@@ -125,17 +127,16 @@ class CrossMapNormalFunc : public FunctionBase {
     pow_ = config.get<real>("pow");
   }
 
-  void calc(const BufferArgs& inputs,
-            const BufferArgs& outputs,
-            const BufferArgs& inouts) override {
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(1, inputs.size());
     CHECK_EQ(2, outputs.size());
-    CHECK_EQ(0, inouts.size());
 
     CHECK_EQ(inputs[0].shape().ndims(), 4);
     CHECK(inputs[0].shape() == outputs[0].shape());
     CHECK(inputs[0].shape() == outputs[1].shape());
 
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
     size_t samples = inputs[0].shape()[0];
     size_t channels = inputs[0].shape()[1];
     size_t height = inputs[0].shape()[2];
@@ -160,6 +161,8 @@ class CrossMapNormalFunc : public FunctionBase {
 };
 
 /**
+ * \brief {o_0} = calc(i_0, i_1, i_2, i_3)
+ *
  * \param inputs[0] input value.
  * \param inputs[1] output value.
  * \param inputs[2] output grad.
@@ -175,12 +178,9 @@ class CrossMapNormalGradFunc : public FunctionBase {
     pow_ = config.get<real>("pow");
   }
 
-  void calc(const BufferArgs& inputs,
-            const BufferArgs& outputs,
-            const BufferArgs& inouts) override {
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(4, inputs.size());
     CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
 
     CHECK_EQ(inputs[0].shape().ndims(), 4);
     CHECK(inputs[0].shape() == inputs[1].shape());
@@ -188,6 +188,9 @@ class CrossMapNormalGradFunc : public FunctionBase {
     CHECK(inputs[0].shape() == inputs[3].shape());
     CHECK(inputs[0].shape() == outputs[0].shape());
 
+    // TODO(hedaoyuan): need to support ASSIGN_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
     size_t samples = inputs[0].shape()[0];
     size_t channels = inputs[0].shape()[1];
     size_t height = inputs[0].shape()[2];
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index 2f56cfc1b5492..46af4e946258a 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -72,16 +72,18 @@ FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
   return *this;
 }
 
-void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape) {
-  args_.push_back(std::make_shared<BufferArg>(arg, shape));
+void BufferArgs::addArg(const Matrix& arg,
+                        const TensorShape& shape,
+                        ArgType argType) {
+  args_.push_back(std::make_shared<BufferArg>(arg, shape, argType));
 }
 
-void BufferArgs::addArg(const CpuSparseMatrix& arg) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg));
+void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
+  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
 }
 
-void BufferArgs::addArg(const GpuSparseMatrix& arg) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg));
+void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
+  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
 }
 
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 88d6824aa3939..249f8f9cfad58 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -49,7 +49,7 @@ class FuncConfig {
 /**
  * Argument type for Function::calc().
  * A BufferArgs contains a set of BufferArg,
- * because Function can have multiple inputs, outputs and inouts.
+ * because Function can have multiple inputs and outputs.
  */
 class BufferArgs {
 public:
@@ -58,9 +58,11 @@ class BufferArgs {
 
   // add argument into BufferArgs
   // Tensor can be Matrix, Vector, IVector.
+  // For inputs, argType does not need to be specified.
+  // For outputs, argType must be specified as ASSIGN_TO or ADD_TO.
   template <typename Tensor>
-  void addArg(const Tensor& arg) {
-    args_.push_back(std::make_shared<BufferArg>(arg));
+  void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) {
+    args_.push_back(std::make_shared<BufferArg>(arg, argType));
   }
 
   // Add arg into BufferArgs and reshape the arg.
@@ -68,10 +70,12 @@ class BufferArgs {
   // For example, arg represents an image buffer,
   // but Matrix can only represent a two-dimensional Tensor.
   // So need an extra argument to describe the shape of the image buffer.
-  void addArg(const Matrix& arg, const TensorShape& shape);
+  void addArg(const Matrix& arg,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED);
 
-  void addArg(const CpuSparseMatrix& arg);
-  void addArg(const GpuSparseMatrix& arg);
+  void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+  void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
 
   // get argument
   const BufferArg& operator[](size_t num) const {
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index 26783a42cac42..04d06cf33fed1 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -122,14 +122,13 @@ void ContextProjection::forward() {
 
   BufferArgs inputs;
   BufferArgs outputs;
-  BufferArgs inouts;
   inputs.addArg(*in_->value);
   inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
                           w_ptr ? w_ptr->getHeight() : 0,
                           input_dim));
   inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(*out_->value);
-  forward_[0]->calc(inputs, outputs, inouts);
+  outputs.addArg(*out_->value, ADD_TO);
+  forward_[0]->calc(inputs, outputs);
 
   if (state_ && config_.context_start() < 0) {
     CHECK_EQ(1, in_->getNumSequences());
@@ -166,15 +165,14 @@ void ContextProjection::backward(const UpdateCallback& callback) {
 
   BufferArgs inputs;
   BufferArgs outputs;
-  BufferArgs inouts;
   inputs.addArg(CpuMatrix(
       in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim));
   inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
                           w_ptr ? w_ptr->getHeight() : 0,
                           input_dim));
   inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(*out_->grad);
-  backward_[0]->calc(inputs, outputs, inouts);
+  outputs.addArg(*out_->grad, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
 
   if (config_.trainable_padding()) {
     weight_->getParameterPtr()->incUpdate(callback);
diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp
index 573de152fd0d5..4331009de7e98 100644
--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
@@ -59,7 +59,6 @@ bool CMRProjectionNormLayer::init(const LayerMap& layerMap,
 
 void CMRProjectionNormLayer::forward(PassType passType) {
   Layer::forward(passType);
-
   /* malloc memory for the output_ if necessary */
   /* note: one sample correspond to one row */
   MatrixPtr input = inputLayers_[0]->getOutputValue();
@@ -67,42 +66,36 @@ void CMRProjectionNormLayer::forward(PassType passType) {
   int size = getSize();
   resetOutput(batchSize, size);
 
-  MatrixPtr outV = getOutputValue();
-
   Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);
 
   shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
 
+  // prepare forward arguments
   BufferArgs inputs;
   BufferArgs outputs;
-  BufferArgs inouts;
-  inputs.addArg(*input, shape_);
-  outputs.addArg(*outV, shape_);
-  outputs.addArg(*denoms_, shape_);
+  inputs.addArg(*getInputValue(0), shape_);
+  outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO);
+  outputs.addArg(*denoms_, shape_, ASSIGN_TO);
 
-  forward_[0]->calc(inputs, outputs, inouts);
+  forward_[0]->calc(inputs, outputs);
 }
 
 void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
   (void)callback;
 
-  if (NULL == inputLayers_[0]->getOutputGrad()) {
+  if (NULL == getInputGrad(0)) {
     return;
   }
-  /* Do derivation */
-  MatrixPtr preOutGrad = inputLayers_[0]->getOutputGrad();
-  MatrixPtr localGrad = getOutputGrad();
-  MatrixPtr localOutV = getOutputValue();
-  MatrixPtr preOutV = inputLayers_[0]->getOutputValue();
 
+  // prepare backward arguments
   BufferArgs inputs;
   BufferArgs outputs;
-  BufferArgs inouts;
-  inputs.addArg(*preOutV, shape_);
-  inputs.addArg(*localOutV, shape_);
-  inputs.addArg(*localGrad, shape_);
+  inputs.addArg(*getInputValue(0), shape_);
+  inputs.addArg(*getOutputValue(), shape_);
+  inputs.addArg(*getOutputGrad(), shape_);
   inputs.addArg(*denoms_, shape_);
-  outputs.addArg(*preOutGrad, shape_);
-  backward_[0]->calc(inputs, outputs, inouts);
+  outputs.addArg(*getInputGrad(0), shape_, ADD_TO);
+
+  backward_[0]->calc(inputs, outputs);
 }
 }  // namespace paddle

From ae4400beda6ce14e78d137ff60da4196f7e6c70c Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Tue, 10 Jan 2017 13:17:23 +0800
Subject: [PATCH 11/11] Bug fix for mac os

---
 paddle/function/BufferArg.h             | 12 ++++++------
 paddle/function/ContextProjectionOp.cpp |  8 ++++----
 paddle/function/CrossMapNormalOp.cpp    |  4 ++--
 paddle/function/TensorShape.h           |  4 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 6576d18dae99e..9649913fa8d9b 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -126,7 +126,7 @@ class BufferArg {
     CHECK(buf_);
     CHECK(valueType_ == DataType<real>::value);
     // CHECK(deviceType_ == DType);
-    CHECK_EQ(2, shape_.ndims());
+    CHECK_EQ((size_t)2, shape_.ndims());
     return typename Tensor<real, DType>::Matrix(
         reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
   }
@@ -136,7 +136,7 @@ class BufferArg {
     CHECK(buf_);
     CHECK(valueType_ == DataType<VType>::value);
     // CHECK(deviceType_ == DType);
-    CHECK_EQ(1, shape_.ndims());
+    CHECK_EQ((size_t)1, shape_.ndims());
     return typename Tensor<VType, DType>::Vector(
         shape_[0], reinterpret_cast<VType*>(buf_));
   }
@@ -176,7 +176,7 @@ class SequenceIdArg : public BufferArg {
                 const TensorShape& shape,
                 ArgType argType = UNSPECIFIED)
       : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
-    CHECK_EQ(shape_.ndims(), 1);
+    CHECK_EQ(shape_.ndims(), (size_t)1);
     numSeqs_ = shape_[0] - 1;
   }
 
@@ -238,9 +238,9 @@ class SparseMatrixArg : public BufferArg {
         format_(format),
         type_(type) {
     CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
-    CHECK_EQ(shape_.ndims(), 2);
-    CHECK_EQ(row_.shape().ndims(), 1);
-    CHECK_EQ(col_.shape().ndims(), 1);
+    CHECK_EQ(shape_.ndims(), (size_t)2);
+    CHECK_EQ(row_.shape().ndims(), (size_t)1);
+    CHECK_EQ(col_.shape().ndims(), (size_t)1);
     if (format == SPARSE_CSR_FORMAT) {
       CHECK_EQ(nnz, col.shape()[0]);
     } else if (format == SPARSE_CSC_FORMAT) {
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index ca7a11f93683f..cb448562ebb37 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -85,8 +85,8 @@ class ContextProjectionForwardFunc : public FunctionBase {
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(3, inputs.size());
-    CHECK_EQ(1, outputs.size());
+    CHECK_EQ((size_t)3, inputs.size());
+    CHECK_EQ((size_t)1, outputs.size());
 
     CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data());
     CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
@@ -193,8 +193,8 @@ class ContextProjectionBackwardFunc : public FunctionBase {
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ(3, inputs.size());
-    CHECK_EQ(1, outputs.size());
+    CHECK_EQ((size_t)3, inputs.size());
+    CHECK_EQ((size_t)1, outputs.size());
 
     CHECK(outputs[0].data() && inputs[2].data());
     CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index cf989468403d2..92980c503fdaa 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -131,7 +131,7 @@ class CrossMapNormalFunc : public FunctionBase {
     CHECK_EQ((size_t)1, inputs.size());
     CHECK_EQ((size_t)2, outputs.size());
 
-    CHECK_EQ(inputs[0].shape().ndims(), 4);
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
     CHECK(inputs[0].shape() == outputs[0].shape());
     CHECK(inputs[0].shape() == outputs[1].shape());
 
@@ -182,7 +182,7 @@ class CrossMapNormalGradFunc : public FunctionBase {
     CHECK_EQ((size_t)4, inputs.size());
     CHECK_EQ((size_t)1, outputs.size());
 
-    CHECK_EQ(inputs[0].shape().ndims(), 4);
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
     CHECK(inputs[0].shape() == inputs[1].shape());
     CHECK(inputs[0].shape() == inputs[2].shape());
     CHECK(inputs[0].shape() == inputs[3].shape());
diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h
index 0333fe18316ba..e491e3f1d6b26 100644
--- a/paddle/function/TensorShape.h
+++ b/paddle/function/TensorShape.h
@@ -42,14 +42,14 @@ class TensorShape {
 
   // get the size of specified dimension
   size_t operator[](size_t dim) const {
-    CHECK_GE(dim, 0);
+    CHECK_GE(dim, (size_t)0);  // dim is unsigned, so this bound always holds
     CHECK_LT(dim, ndims_);
     return dims_[dim];
   }
 
   // set the size of specified dimension
   void setDim(size_t dim, size_t size) {
-    CHECK_GE(dim, 0);
+    CHECK_GE(dim, (size_t)0);
     CHECK_LT(dim, ndims_);
     dims_[dim] = size;
     numElements();