From 8cd2222e49a1a4f07665efc3864c9a3f43f53941 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Mon, 15 May 2017 14:56:21 +0800
Subject: [PATCH] merge convolution logic into class Matrix

---
 paddle/gserver/layers/ConvShiftLayer.cpp | 146 ++---------------------
 paddle/math/Matrix.cpp                   |  95 ++++++++++-----
 paddle/math/Matrix.h                     |  18 ++-
 3 files changed, 90 insertions(+), 169 deletions(-)

diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp
index a8b04a88267d7..e4dd7f0ee034b 100644
--- a/paddle/gserver/layers/ConvShiftLayer.cpp
+++ b/paddle/gserver/layers/ConvShiftLayer.cpp
@@ -52,9 +52,6 @@ class ConvShiftLayer : public Layer {
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback = nullptr) override;
-  bool isSeqType();
-  void circularConvSeq();
-  void circularConvSeqDerivative();
 };
 
 REGISTER_LAYER(conv_shift, ConvShiftLayer);
 
@@ -69,122 +66,12 @@ bool ConvShiftLayer::init(const LayerMap& layerMap,
   return true;
 }
 
-bool ConvShiftLayer::isSeqType() {
-  const Argument& inLayer0 = getInput(0);
-  if (nullptr == inLayer0.sequenceStartPositions)
-    return false;
-  else
-    return true;
-}
-
-void ConvShiftLayer::circularConvSeq() {
-  const Argument& inLayer0 = getInput(0);
-  MatrixPtr in0 = inLayer0.value;
-  MatrixPtr in1 = getInputValue(1);
-  MatrixPtr out = getOutputValue();
-  const ICpuGpuVectorPtr& sequenceStartPositions =
-      inLayer0.sequenceStartPositions;
-
-  size_t width0 = in0->getWidth();
-  size_t numSeqs = sequenceStartPositions->getSize() - 1;
-  size_t height0 = in0->getHeight();
-  size_t width1 = in1->getWidth();
-  size_t height1 = in1->getHeight();
-
-  CHECK_EQ(numSeqs, height1);
-  CHECK_EQ(width0, out->getWidth());
-  CHECK_EQ(height0, out->getHeight());
-
-  CHECK_EQ(width1 % 2, 1U);
-
-  real* inV0 = in0->getData();
-  const int* startPosIntPtr = sequenceStartPositions->getData(false);
-  real* inV1 = in1->getData();
-  real* outV = out->getData();
-
-  int leftCtxLen = (width1 - 1) / 2;
-  for (size_t x = 0; x < numSeqs - 1; x++) {
-    int curSeqLen = startPosIntPtr[x + 1];
-    size_t curSeqWidth = curSeqLen * width0;
-    for (size_t i = 0; i < curSeqWidth; i++) {
-      for (size_t j = 0; j < width1; ++j) {
-        int index = i + j - leftCtxLen;
-        index = (index + curSeqWidth) % curSeqWidth;
-        int outVRowOffset = i / width0;
-        int outVColOffset = i % width0;
-        int inV0RowOffset = index / width0;
-        int inV0ColOffset = index % width0;
-        (outV + outVRowOffset)[outVColOffset] +=
-            (inV0 + inV0RowOffset)[inV0ColOffset] * inV1[j];
-      }
-    }
-    outV += curSeqWidth;
-    inV0 += curSeqWidth;
-    inV1 += width1;
-  }
-}
-
-void ConvShiftLayer::circularConvSeqDerivative() {
-  const Argument& inLayer0 = getInput(0);
-  MatrixPtr in0 = inLayer0.value;
-  MatrixPtr in1 = getInputValue(1);
-  MatrixPtr inG0 = getInputGrad(0);
-  MatrixPtr inG1 = getInputGrad(1);
-  MatrixPtr outG = getOutputGrad();
-  const ICpuGpuVectorPtr& sequenceStartPositions =
-      inLayer0.sequenceStartPositions;
-
-  size_t height0 = in0->getHeight();
-  size_t height1 = in1->getHeight();
-  size_t numSeqs = sequenceStartPositions->getSize() - 1;
-  size_t width0 = in0->getWidth();
-  size_t width1 = in1->getWidth();
-
-  CHECK_EQ(height1, numSeqs);
-  CHECK_EQ(height0, inG0->getHeight());
-  CHECK_EQ(width0, inG0->getWidth());
-  CHECK_EQ(height1, inG1->getHeight());
-  CHECK_EQ(width1, inG1->getWidth());
-  CHECK_EQ(height0, outG->getHeight());
-  CHECK_EQ(width0, outG->getWidth());
-
-  const int* startPosIntPtr = sequenceStartPositions->getData(false);
-  real* outGV = outG->getData();
-  real* inV0 = in0->getData();
-  real* inV1 = in1->getData();
-  real* inGV0 = inG0->getData();
-  real* inGV1 = inG1->getData();
-
-  int leftCtxLen = (width1 - 1) / 2;
-  for (size_t x = 0; x < numSeqs - 1; x++) {
-    int curSeqLen = startPosIntPtr[x + 1];
-    size_t curSeqWidth = curSeqLen * width0;
-    for (size_t j = 0; j < width1; j++) {
-      for (size_t i = 0; i < curSeqWidth; i++) {
-        int index = i + j - leftCtxLen;
-        index = (index + curSeqWidth) % curSeqWidth;
-        int inGV0RowOffset = index / width0;
-        int inGV0ColOffset = index % width0;
-        int outGVRowOffset = i / width0;
-        int outGVColOffset = i % width0;
-        (inGV0 + inGV0RowOffset)[inGV0ColOffset] +=
-            (outGV + outGVRowOffset)[outGVColOffset] * inV1[j];
-        inGV1[j] += (outGV + outGVRowOffset)[outGVColOffset] *
-                    (inGV0 + inGV0RowOffset)[inGV0ColOffset];
-      }
-    }
-    outGV += curSeqWidth;
-    inV0 += curSeqWidth;
-    inV1 += width1;
-    inGV0 += curSeqWidth;
-    inGV1 += width1;
-  }
-}
-
 void ConvShiftLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  const ICpuGpuVectorPtr& seqStartPosPtr = getInput(0).sequenceStartPositions;
 
   size_t batchSize = inV0->getHeight();
   size_t dataDim = inV0->getWidth();
@@ -196,34 +83,27 @@ void ConvShiftLayer::forward(PassType passType) {
     resetOutput(batchSize, dataDim);
   }
 
+  MatrixPtr outV = getOutputValue();
+
   REGISTER_TIMER_INFO("FwConvShiftTimer", getName().c_str());
 
-  if (!isSeqType()) {
-    MatrixPtr inV1 = getInputValue(1);
-    CHECK_EQ(batchSize, inV1->getHeight());
-    MatrixPtr outV = getOutputValue();
-    outV->circularConv(*inV0, *inV1);
-  } else {
-    circularConvSeq();
-  }
+  outV->circularConv(*inV0, *inV1, seqStartPosPtr, useGpu_);
 }
 
 void ConvShiftLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr outG = getOutputGrad();
   MatrixPtr inG0 = getInputGrad(0);
   MatrixPtr inG1 = getInputGrad(1);
+  const ICpuGpuVectorPtr& seqStartPosPtr = getInput(0).sequenceStartPositions;
 
   REGISTER_TIMER_INFO("BwConvShiftTimer", getName().c_str());
 
-  if (!(inG0 && inG1)) {
-    CHECK(!inG0 || !inG1) << "Not supported";
-  }
-
-  if (!isSeqType()) {
-    MatrixPtr inV0 = getInputValue(0);
-    MatrixPtr inV1 = getInputValue(1);
-    MatrixPtr outG = getOutputGrad();
-    outG->circularConvDerivative(*outG, *inV0, *inV1, *inG0, *inG1);
+  if (inG0 && inG1) {
+    outG->circularConvDerivative(
+        *outG, *inV0, *inV1, *inG0, *inG1, seqStartPosPtr, useGpu_);
   } else {
-    circularConvSeqDerivative();
+    CHECK(!inG0 || !inG1) << "Not supported";
   }
 }
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 6ac61be0bf1b7..bf282eb524e3d 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3877,14 +3877,22 @@ real CpuMatrix::getMax() {
   return res;
 }
 
-void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) {
-  size_t height = this->getHeight();
+void CpuMatrix::circularConv(Matrix& in0,
+                             Matrix& in1,
+                             const ICpuGpuVectorPtr& seqStartPosPtr,
+                             bool useGpu) {
+  size_t height0 = this->getHeight();
   size_t width0 = this->getWidth();
   size_t width1 = in1.getWidth();
+  size_t numSeqs = height0;
+  // if sequence type, height1 should be sequence number
+  if (nullptr != seqStartPosPtr) {
+    numSeqs = seqStartPosPtr->getSize() - 1;
+  }
 
-  CHECK_EQ(height, in0.getHeight());
+  CHECK_EQ(height0, in0.getHeight());
   CHECK_EQ(width0, in0.getWidth());
-  CHECK_EQ(height, in1.getHeight());
+  CHECK_EQ(numSeqs, in1.getHeight());
 
   CHECK_EQ(width1 % 2, 1U);
 
@@ -3892,32 +3900,50 @@ void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) {
   real* inV0 = in0.getData();
   real* inV1 = in1.getData();
 
+  const int* startPosIntPtr = nullptr;
+  if (nullptr != seqStartPosPtr) {
+    startPosIntPtr = seqStartPosPtr->getData(useGpu);
+  }
+
   int leftCtxLen = (width1 - 1) / 2;
-  for (size_t x = 0; x < height;
-       ++x, outV += width0, inV0 += width0, inV1 += width1) {
-    for (size_t i = 0; i < width0; ++i) {  // each dimension of output
-      for (size_t j = 0; j < width1; ++j) {
-        // iterate over all dimentions of inV1
-        int index = i + j - leftCtxLen;
-        index = (index + width0) % width0;
-        outV[i] += inV0[index] * inV1[j];
+  // row first order, treate multiple rows as a long row
+  for (size_t x = 0; x < numSeqs; ++x) {
+    size_t curSeqWidth = width0;
+    if (nullptr != startPosIntPtr)
+      curSeqWidth *= startPosIntPtr[x + 1] - startPosIntPtr[x];
+    // conv a complete sequence
+    for (size_t i = 0; i < curSeqWidth; ++i) {
+      for (size_t j = 0; j < width1;
+           ++j) {  // iterate over convolution template
+        int index = (i + j - leftCtxLen + curSeqWidth) % curSeqWidth;
+        *(outV + i) += *(inV0 + index) * inV1[j];
       }
     }
+    outV += curSeqWidth;
+    inV0 += curSeqWidth;
+    inV1 += width1;
   }
 }
 
-void CpuMatrix::circularConvDerivative(
-    Matrix& outG, Matrix& in0, Matrix& in1, Matrix& inG0, Matrix& inG1) {
-  size_t height = in0.getHeight();
+void CpuMatrix::circularConvDerivative(Matrix& outG,
+                                       Matrix& in0,
+                                       Matrix& in1,
+                                       Matrix& inG0,
+                                       Matrix& inG1,
+                                       const ICpuGpuVectorPtr& seqStartPosPtr,
+                                       bool useGpu) {
+  size_t height0 = in0.getHeight();
   size_t width0 = in0.getWidth();
   size_t width1 = in1.getWidth();
+  size_t numSeqs = height0;
+  if (nullptr != seqStartPosPtr) numSeqs = seqStartPosPtr->getSize() - 1;
 
-  CHECK_EQ(height, in1.getHeight());
-  CHECK_EQ(height, inG0.getHeight());
+  CHECK_EQ(numSeqs, in1.getHeight());
+  CHECK_EQ(height0, inG0.getHeight());
   CHECK_EQ(width0, inG0.getWidth());
-  CHECK_EQ(height, inG1.getHeight());
+  CHECK_EQ(numSeqs, inG1.getHeight());
   CHECK_EQ(width1, inG1.getWidth());
-  CHECK_EQ(height, outG.getHeight());
+  CHECK_EQ(height0, outG.getHeight());
   CHECK_EQ(width0, outG.getWidth());
 
   real* outGV = outG.getData();
@@ -3925,23 +3951,28 @@ void CpuMatrix::circularConvDerivative(
   real* inV1 = in1.getData();
   real* inGV0 = inG0.getData();
   real* inGV1 = inG1.getData();
+  const int* startPosIntPtr = nullptr;
+  if (nullptr != seqStartPosPtr) {
+    startPosIntPtr = seqStartPosPtr->getData(useGpu);
+  }
 
   int leftCtxLen = (width1 - 1) / 2;
-  for (size_t x = 0; x < height; ++x,
-                                 outGV += width0,
-                                 inV0 += width0,
-                                 inV1 += width1,
-                                 inGV0 += width0,
-                                 inGV1 += width1) {
-    for (size_t j = 0; j < width1; ++j) {  // iterate over width1
-      for (size_t i = 0; i < width0; ++i) {
-        // such over all dimensions of outG
-        int index = i + j - leftCtxLen;
-        index = (index + width0) % width0;
-        inGV0[index] += outGV[i] * inV1[j];
-        inGV1[j] += outGV[i] * inV0[index];
+  for (size_t x = 0; x < numSeqs; ++x) {
+    size_t curSeqWidth = width0;
+    if (nullptr != startPosIntPtr)
+      curSeqWidth *= startPosIntPtr[x + 1] - startPosIntPtr[x];
+    for (size_t j = 0; j < width1; ++j) {  // iterate over convolution template
+      for (size_t i = 0; i < curSeqWidth; i++) {
+        int index = (i + j - leftCtxLen + curSeqWidth) % curSeqWidth;
+        *(inGV0 + index) += *(outGV + i) * inV1[j];
+        inGV1[j] += *(outGV + i) * *(inV0 + index);
       }
     }
+    outGV += curSeqWidth;
+    inV0 += curSeqWidth;
+    inV1 += width1;
+    inGV0 += curSeqWidth;
+    inGV1 += width1;
   }
 }
 
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 3252adb19e4c2..2dcc04fb59feb 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -744,7 +744,10 @@
    * b's index arithmetic is computed modulo M,
    * c's index arithmetic is computed modulo N.
    */
-  virtual void circularConv(Matrix& b, Matrix& c) {
+  virtual void circularConv(Matrix& b,
+                            Matrix& c,
+                            const ICpuGpuVectorPtr& seqStartPosPtr,
+                            bool useGpu) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -752,7 +755,9 @@
                                       Matrix& prevOut1,
                                       Matrix& prevOut2,
                                       Matrix& prevGrad1,
-                                      Matrix& prevGrad2) {
+                                      Matrix& prevGrad2,
+                                      const ICpuGpuVectorPtr& seqStartPosPtr,
+                                      bool useGpu) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -1719,12 +1724,17 @@ class CpuMatrix : public Matrix {
                         IVector& label,
                         real alpha);
 
-  void circularConv(Matrix& b, Matrix& c);
+  void circularConv(Matrix& b,
+                    Matrix& c,
+                    const ICpuGpuVectorPtr& seqStartPosPtr = nullptr,
+                    bool useGpu = false);
   void circularConvDerivative(Matrix& output,
                               Matrix& prevOut1,
                               Matrix& prevOut2,
                               Matrix& prevGrad1,
-                              Matrix& prevGrad2);
+                              Matrix& prevGrad2,
+                              const ICpuGpuVectorPtr& seqStartPosPtr = nullptr,
+                              bool useGpu = false);
 
   void softmax(Matrix& output);
   void sequenceSoftmax(Matrix& output, const IVector& index);
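
For readers skimming the patch: the core change is that CpuMatrix::circularConv (and its derivative) now take an optional sequence-start vector. When it is present, all rows of one sequence are treated as a single long row of length seqLen * width0 and the odd-width kernel row is applied circularly modulo that length; when it is absent, numSeqs falls back to the matrix height and each row is convolved on its own, reproducing the old per-row behaviour. Below is a minimal standalone sketch of that forward logic in plain C++; the name circularConvRef, the std::vector containers, and the explicit seqStartPos argument are illustrative stand-ins for this note, not Paddle's Matrix / ICpuGpuVector API.

#include <cstddef>
#include <vector>

// Reference sketch (not Paddle API) of the sequence-aware circular convolution.
//   in0:         flattened (totalRows x width0) input, row-major.
//   in1:         flattened (numSeqs x width1) kernel rows, width1 must be odd.
//   seqStartPos: row offset of each sequence, size numSeqs + 1.
//   out:         same shape as in0; results are accumulated into it.
void circularConvRef(const std::vector<float>& in0, size_t width0,
                     const std::vector<float>& in1, size_t width1,
                     const std::vector<int>& seqStartPos,
                     std::vector<float>& out) {
  const size_t numSeqs = seqStartPos.size() - 1;
  const long leftCtxLen = (static_cast<long>(width1) - 1) / 2;
  const float* inV0 = in0.data();
  const float* inV1 = in1.data();
  float* outV = out.data();
  for (size_t x = 0; x < numSeqs; ++x) {
    // All rows of one sequence are treated as a single long row.
    const size_t curSeqWidth =
        static_cast<size_t>(seqStartPos[x + 1] - seqStartPos[x]) * width0;
    const long m = static_cast<long>(curSeqWidth);
    for (size_t i = 0; i < curSeqWidth; ++i) {
      for (size_t j = 0; j < width1; ++j) {
        // Circular index, taken modulo the length of the whole sequence.
        const long k = static_cast<long>(i + j) - leftCtxLen;
        const size_t index = static_cast<size_t>(((k % m) + m) % m);
        outV[i] += inV0[index] * inV1[j];
      }
    }
    // Advance to the next sequence and its kernel row.
    outV += curSeqWidth;
    inV0 += curSeqWidth;
    inV1 += width1;
  }
}

Passing seqStartPos = {0, 1, 2, ..., numRows} makes every row its own sequence, which corresponds to the non-sequence path of the patched CpuMatrix::circularConv.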