From 1644c72accb59c325c7e17bb1bb46e03391a4c27 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Wed, 11 Oct 2017 16:07:30 +0800 Subject: [PATCH 01/18] Add framework of the factorization machine layer --- doc/api/v2/config/layer.rst | 15 +++-- .../layers/FactorizationMachineLayer.cpp | 65 +++++++++++++++++++ .../layers/FactorizationMachineLayer.h | 59 +++++++++++++++++ paddle/gserver/tests/test_LayerGrad.cpp | 19 ++++++ proto/ModelConfig.proto | 3 + python/paddle/trainer/config_parser.py | 15 +++++ .../paddle/trainer_config_helpers/layers.py | 65 +++++++++++++++++++ .../tests/configs/file_list.sh | 3 +- .../test_factorization_machine.protostr | 39 +++++++++++ .../configs/test_factorization_machine.py | 9 +++ 10 files changed, 287 insertions(+), 5 deletions(-) create mode 100644 paddle/gserver/layers/FactorizationMachineLayer.cpp create mode 100644 paddle/gserver/layers/FactorizationMachineLayer.h create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index d4e9d53e5c095..89d6953c33010 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -54,7 +54,7 @@ img_conv .. _api_v2.layer_context_projection: -context_projection +context_projection ------------------ .. autoclass:: paddle.v2.layer.context_projection :noindex: @@ -70,7 +70,7 @@ Image Pooling Layer img_pool -------- .. autoclass:: paddle.v2.layer.img_pool - :noindex: + :noindex: spp --- @@ -99,7 +99,7 @@ sum_to_one_norm --------------- .. autoclass:: paddle.v2.layer.sum_to_one_norm :noindex: - + cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm @@ -109,7 +109,7 @@ row_l2_norm ----------- .. autoclass:: paddle.v2.layer.row_l2_norm :noindex: - + Recurrent Layers ================ @@ -395,6 +395,13 @@ multiplex .. autoclass:: paddle.v2.layer.multiplex :noindex: +Factorization Machine Layer +============================ + +factorization_machine +--------------------- +.. autoclass:: paddle.v2.layer.factorization_machine + :noindex: Slicing and Joining Layers ========================== diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp new file mode 100644 index 0000000000000..5456bf2601eab --- /dev/null +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "FactorizationMachineLayer.h" +#include +#include +#include "paddle/math/SparseMatrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(factorization_machine, FactorizationMachineLayer); + +bool FactorizationMachineLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + factorSize_ = config_.factor_size(); + + /* initialize the latentVectors_ */ + CHECK_EQ(inputLayers_.size(), 1UL); + size_t height = inputLayers_[0]->getSize(); + latentVectors_.reset(new Weight(height, factorSize_, parameters_[0])); + + return true; +} + +void FactorizationMachineLayer::forward(PassType passType) { + Layer::forward(passType); + + auto input = getInput(0); + + int batchSize = input.getBatchSize(); + int size = getSize(); + reserveOutput(batchSize, size); + + MatrixPtr outV = getOutputValue(); + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void FactorizationMachineLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h new file mode 100644 index 0000000000000..e7807c8986c21 --- /dev/null +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/math/Matrix.h" +#include "paddle/utils/ThreadLocal.h" + +namespace paddle { +/** + * @brief The Factorization Machine models pairwise (order-2) feature + * interactions as inner product of the learned latent vectors corresponding + * to each input feature. + * + * The Factorization Machine can effectively capture feature interactions + * especially when the input is sparse. While in principle FM can model higher + * order feature interaction, in practice usually only order-2 feature + * interactions are considered. The Factorization Machine Layer here only + * computes the order-2 interations with the formula: + * + * \f[ + * y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j + * \f] + * + * The config file api is factorization_machine. 
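+ *
+ * Expanding the pairwise sum gives an equivalent form that can be
+ * evaluated in O(k*n) rather than O(k*n^2) time, which is what the
+ * forward pass computes:
+ *
+ * \f[
+ * y = \frac{1}{2}\sum_{f=1}^{k}\left[\left(\sum_{i=1}^{n}v_{i,f}x_i\right)^2
+ *     - \sum_{i=1}^{n}v_{i,f}^{2}x_i^{2}\right]
+ * \f]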
+ */ + +class FactorizationMachineLayer : public Layer { +protected: + /// The latent vectors, shape: (size, factorSize_) + std::unique_ptr latentVectors_; + /// The hyperparameter that defines the dimensionality of the factorization + size_t factorSize_; + +public: + explicit FactorizationMachineLayer(const LayerConfig& config) + : Layer(config) {} + ~FactorizationMachineLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 90a3352898863..542db5ee5b74c 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2359,6 +2359,25 @@ TEST(Layer, ScaleShiftLayer) { } } +void testFactorizationMachineLayer(InputType type, bool useGpu) { + const int FACTOR_SIZE = 10; + TestConfig config; + config.layerConfig.set_type("factorization_machine"); + config.layerConfig.set_factor_size(FACTOR_SIZE); + config.biasSize = 1; + config.inputDefs.push_back({type, "layer_0", 8192, 0}); + config.layerConfig.add_inputs(); + testLayerGrad(config, "factorization_machine", 16, false, useGpu, false); +} + +TEST(Layer, FactorizationMachineLayer) { + testFactorizationMachineLayer(INPUT_DATA, false); + testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); +#ifdef PADDLE_WITH_CUDA + testFactorizationMachineLayer(INPUT_DATA, true); +#endif +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index ebf0911d6ea0b..0d2140ccf9390 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -525,6 +525,9 @@ message LayerConfig { // for switch order layer optional ReshapeConfig reshape_conf = 59; + + // for factorization machine layer + optional uint32 factor_size = 60; } message EvaluatorConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 098a51ab87912..07b3ff66dc7a2 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -3780,6 +3780,21 @@ def __init__(self, name, inputs, reshape, **xargs): self.config.reshape_conf.width_axis.extend(reshape['width']) +@config_layer('factorization_machine') +class FactorizationMachineLayer(LayerBase): + def __init__(self, name, inputs, factor_size, **xargs): + super(FactorizationMachineLayer, self).__init__( + name, 'factorization_machine', size=1, inputs=inputs, **xargs) + config_assert( + len(self.inputs) == 1, + 'factorization machine layer must have one and only one input.') + self.config.factor_size = factor_size + input_layer = self.get_input_layer(0) + psize = input_layer.size * factor_size + dims = [input_layer.size, 1] + self.create_input_parameter(0, psize, dims) + + # Deprecated, use a new layer specific class instead @config_func def Layer(name, type, **xargs): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index d37f29d2c4bf9..e6348dca2a816 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -143,6 +143,7 @@ 'scale_shift_layer', 'img_conv3d_layer', 'resize_layer', + 'factorization_machine', ] @@ -253,6 +254,8 @@ class LayerType(object): RESIZE = 'resize' + FACTORIZATION_MACHINE = 'factorization_machine' 
+ @staticmethod def is_layer_type(type_name): """ @@ -6955,3 +6958,65 @@ def resize_layer(input, size, name=None): """ Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size) return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size) + + +@wrap_name_default() +@wrap_act_default(act=LinearActivation()) +@wrap_param_attr_default() +@layer_support() +def factorization_machine(input, + factor_size, + act=None, + name=None, + param_attr=None, + layer_attr=None): + """ + The Factorization Machine models pairwise feature interactions as inner + product of the learned latent vectors corresponding to each input feature. + + The Factorization Machine can effectively capture feature interactions + especially when the input is sparse. In practice, usually order 2 feature + interactions are considered using Factorization Machine with the formula: + + .. math:: + + y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j + + Note: + X is the input vector with size n. V is the factor matrix. Each row of V + is the latent vector corresponding to each input dimesion. The size of + each latent vector is k. + + .. code-block:: python + + factor_machine = factorization_machine(input=input_layer, factor_size=10) + + :param input: The input layer. + :type input: LayerOutput + :param factor_size: The hyperparameter that defines the dimensionality of + the latent vector size + :type context_len: int + :param act: Activation Type. Default is linear activation. + :type act: BaseActivation + :param param_attr: The Parameter Attribute. If None, the latent vectors will + be initialized smartly. It's better to set it by + yourself. + :type param_attr: ParameterAttribute + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + + """ + assert isinstance(input, LayerOutput) + assert factor_size > 0, "the factor_size must be greater than 0." 
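+    # The layer learns one factor_size-dimensional latent vector per input
+    # dimension, so its single parameter matrix has shape
+    # (input.size, factor_size).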
+ + Layer( + inputs=[Input(input.name, **param_attr.attr)], + name=name, + factor_size=factor_size, + type=LayerType.FACTORIZATION_MACHINE, + active_type=act.name, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1) diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 6a4550c209762..40bbb04bd493b 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -10,6 +10,7 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer -test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer) +test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer +test_factorization_machine) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr new file mode 100644 index 0000000000000..585a5c7b23dd1 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr @@ -0,0 +1,39 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 1024 + active_type: "" +} +layers { + name: "__factorization_machine_0__" + type: "factorization_machine" + size: 1 + active_type: "" + inputs { + input_layer_name: "data" + input_parameter_name: "___factorization_machine_0__.w0" + } + factor_size: 10 +} +parameters { + name: "___factorization_machine_0__.w0" + size: 10240 + initial_mean: 0.0 + initial_std: 0.03125 + dims: 1024 + dims: 1 + initial_strategy: 0 + initial_smart: true +} +input_layer_names: "data" +output_layer_names: "__factorization_machine_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__factorization_machine_0__" + input_layer_names: "data" + output_layer_names: "__factorization_machine_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py new file mode 100644 index 0000000000000..62ceb359cf8f9 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py @@ -0,0 +1,9 @@ +from paddle.trainer_config_helpers import * + +settings(batch_size=1000, learning_rate=1e-5) + +data = data_layer(name='data', size=1024) + +fm = factorization_machine(input=data, factor_size=10) + +outputs(fm) From f504c8a83d641b573ef0765227246460dea2f764 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Wed, 11 Oct 2017 21:47:27 +0800 Subject: [PATCH 02/18] Remove unnecessary configs --- paddle/gserver/tests/test_LayerGrad.cpp | 4 +--- .../tests/configs/test_factorization_machine.py | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index f63c93c9430c2..eea884cb5021a 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2371,10 +2371,8 @@ void 
testFactorizationMachineLayer(InputType type, bool useGpu) { TEST(Layer, FactorizationMachineLayer) { testFactorizationMachineLayer(INPUT_DATA, false); - testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); -#ifdef PADDLE_WITH_CUDA testFactorizationMachineLayer(INPUT_DATA, true); -#endif + testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); } int main(int argc, char** argv) { diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py index 62ceb359cf8f9..b249de0fee3c8 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py @@ -1,7 +1,5 @@ from paddle.trainer_config_helpers import * -settings(batch_size=1000, learning_rate=1e-5) - data = data_layer(name='data', size=1024) fm = factorization_machine(input=data, factor_size=10) From 947b6a77ce08c1ca2dc386514f0e97eb75ade91a Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Tue, 17 Oct 2017 00:26:53 +0800 Subject: [PATCH 03/18] Implement factorization machine layer --- .../layers/FactorizationMachineLayer.cpp | 62 +++++++++++++++++-- .../layers/FactorizationMachineLayer.h | 12 ++++ paddle/gserver/tests/test_LayerGrad.cpp | 5 +- 3 files changed, 73 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index 5456bf2601eab..09128eeeef143 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -33,7 +33,10 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap, /* initialize the latentVectors_ */ CHECK_EQ(inputLayers_.size(), 1UL); size_t height = inputLayers_[0]->getSize(); - latentVectors_.reset(new Weight(height, factorSize_, parameters_[0])); + latentVectors_ = + std::unique_ptr(new Weight(height, factorSize_, parameters_[0])); + + v2_ = latentVectors_->getW()->clone(0, 0, useGpu_); return true; } @@ -41,14 +44,28 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap, void FactorizationMachineLayer::forward(PassType passType) { Layer::forward(passType); - auto input = getInput(0); + const MatrixPtr& inputV = getInputValue(0); - int batchSize = input.getBatchSize(); - int size = getSize(); + size_t batchSize = inputV->getHeight(); + size_t size = getSize(); reserveOutput(batchSize, size); MatrixPtr outV = getOutputValue(); + Matrix::resizeOrCreate(tmpMul_, batchSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_); + + REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); + tmpMul_->mul(*inputV, *latentVectors_->getW()); + tmpOut_->pow2(*tmpMul_, 2); + outV->sumRows(*tmpOut_, 0.5, 0); + + x2_ = inputV->clone(0, 0, useGpu_); + x2_->pow2(*inputV, 2); + v2_->pow2(*latentVectors_->getW(), 2); + tmpOut_->mul(*x2_, *v2_); + outV->sumRows(*tmpOut_, -0.5, 1.0); + /* activation */ { REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); forwardActivation(); @@ -60,6 +77,43 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); backwardActivation(); } + + const MatrixPtr& inputV = getInputValue(0); + const MatrixPtr& oGrad = getOutputGrad(); + + MatrixPtr tmpSum = + Matrix::create(1, latentVectors_->getW()->getHeight(), false, useGpu_); + MatrixPtr tmpSum_T = 
Matrix::create(tmpSum->getRowBuf(0), + latentVectors_->getW()->getHeight(), + 1, + false, + useGpu_); + + /* Calculate the gradients of the latentVectors_ matrix */ + if (latentVectors_->getWGrad()) { + MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_); + tmpIn->rowScale(0, *inputV, *oGrad); + + latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); + + tmpIn->rowScale(0, *x2_, *oGrad); + tmpSum->sumCols(*tmpIn, -1, 0); + latentVectors_->getWGrad()->addRowScale( + 0, *latentVectors_->getW(), *tmpSum_T); + + /* Increasing the number of gradient */ + latentVectors_->getParameterPtr()->incUpdate(callback); + } + + /* Calculate the input layers gradient */ + MatrixPtr inGrad = getInputGrad(0); + if (inGrad != NULL) { + MatrixPtr latentVectors_T = latentVectors_->getW()->getTranspose(); + inGrad->mul(*tmpMul_, *latentVectors_T, 1, 1); + tmpSum_T->sumRows(*v2_, -1, 0); + inGrad->addColScale(0, *inputV, *tmpSum); + inGrad->rowScale(0, *inGrad, *oGrad); + } } } // namespace paddle diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h index e7807c8986c21..7cf064690ff8d 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -40,10 +40,22 @@ namespace paddle { class FactorizationMachineLayer : public Layer { protected: /// The latent vectors, shape: (size, factorSize_) + /// Each row of the latentVectors_ matrix is the latent vector + /// corresponding to one input feature dimension std::unique_ptr latentVectors_; /// The hyperparameter that defines the dimensionality of the factorization size_t factorSize_; +private: + /// The result of input matrix * letent vector matrix that will be used in + /// both forward and backward step + MatrixPtr tmpMul_; + MatrixPtr tmpOut_; + /// Store the square values of the letent vectors matrix + MatrixPtr v2_; + /// Store the square values of input matrix + MatrixPtr x2_; + public: explicit FactorizationMachineLayer(const LayerConfig& config) : Layer(config) {} diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index eea884cb5021a..21e8fb7eed11d 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2363,8 +2363,9 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) { TestConfig config; config.layerConfig.set_type("factorization_machine"); config.layerConfig.set_factor_size(FACTOR_SIZE); - config.biasSize = 1; - config.inputDefs.push_back({type, "layer_0", 8192, 0}); + config.layerConfig.set_size(1); + config.biasSize = 0; + config.inputDefs.push_back({type, "layer_0", 1024, 10240}); config.layerConfig.add_inputs(); testLayerGrad(config, "factorization_machine", 16, false, useGpu, false); } From 2ce8f1875bb6f69bdc48eb16e78a2c163316ca2b Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Tue, 17 Oct 2017 11:09:41 +0800 Subject: [PATCH 04/18] Fix tests for factorization machine layer --- paddle/gserver/tests/test_LayerGrad.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 21e8fb7eed11d..54053b751bf7c 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2373,7 +2373,6 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) { TEST(Layer, FactorizationMachineLayer) { testFactorizationMachineLayer(INPUT_DATA, false); testFactorizationMachineLayer(INPUT_DATA, true); - 
testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); } int main(int argc, char** argv) { From 86053e7766a93ee0130131c20f262c58a4cbc86d Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Tue, 17 Oct 2017 12:20:43 +0800 Subject: [PATCH 05/18] Reduce the input size in testing factorization machine --- paddle/gserver/tests/test_LayerGrad.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 54053b751bf7c..6c604b1e6710b 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2365,14 +2365,15 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) { config.layerConfig.set_factor_size(FACTOR_SIZE); config.layerConfig.set_size(1); config.biasSize = 0; - config.inputDefs.push_back({type, "layer_0", 1024, 10240}); + config.inputDefs.push_back({type, "layer_0", 128, 1280}); config.layerConfig.add_inputs(); testLayerGrad(config, "factorization_machine", 16, false, useGpu, false); } TEST(Layer, FactorizationMachineLayer) { - testFactorizationMachineLayer(INPUT_DATA, false); - testFactorizationMachineLayer(INPUT_DATA, true); + for (auto useGpu : {false, true}) { + testFactorizationMachineLayer(INPUT_DATA, useGpu); + } } int main(int argc, char** argv) { From 9741ade8ee761f78291e249ea17ad5e3e2c904d2 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Tue, 17 Oct 2017 16:53:54 +0800 Subject: [PATCH 06/18] Change pow to square in factorization machine layer --- paddle/gserver/layers/FactorizationMachineLayer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index 09128eeeef143..8d9dcbaea7f75 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -57,12 +57,12 @@ void FactorizationMachineLayer::forward(PassType passType) { REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); tmpMul_->mul(*inputV, *latentVectors_->getW()); - tmpOut_->pow2(*tmpMul_, 2); + tmpMul_->square2(*tmpOut_); outV->sumRows(*tmpOut_, 0.5, 0); x2_ = inputV->clone(0, 0, useGpu_); - x2_->pow2(*inputV, 2); - v2_->pow2(*latentVectors_->getW(), 2); + inputV->square2(*x2_); + latentVectors_->getW()->square2(*v2_); tmpOut_->mul(*x2_, *v2_); outV->sumRows(*tmpOut_, -0.5, 1.0); From 8654e8a5203c62ca7b69c1778ff0b71f7c5f8223 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Tue, 17 Oct 2017 23:42:51 +0800 Subject: [PATCH 07/18] Fix dims in config parser for factorization machine layer --- python/paddle/trainer/config_parser.py | 2 +- .../tests/configs/protostr/test_factorization_machine.protostr | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 9aba0b49ad880..557a91ca7b5b8 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -3794,7 +3794,7 @@ def __init__(self, name, inputs, factor_size, **xargs): self.config.factor_size = factor_size input_layer = self.get_input_layer(0) psize = input_layer.size * factor_size - dims = [input_layer.size, 1] + dims = [input_layer.size, factor_size] self.create_input_parameter(0, psize, dims) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr 
b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr index 585a5c7b23dd1..4f3002b19942e 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr @@ -22,7 +22,7 @@ parameters { initial_mean: 0.0 initial_std: 0.03125 dims: 1024 - dims: 1 + dims: 10 initial_strategy: 0 initial_smart: true } From 4c72b0634cc2c280f0edcc84a0ece00511fdd6cd Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Wed, 18 Oct 2017 15:36:36 +0800 Subject: [PATCH 08/18] Fix creation of tmp variable in factorization machine layer --- paddle/gserver/layers/FactorizationMachineLayer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index 8d9dcbaea7f75..e5c9d1a90d5bc 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -33,10 +33,11 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap, /* initialize the latentVectors_ */ CHECK_EQ(inputLayers_.size(), 1UL); size_t height = inputLayers_[0]->getSize(); + CHECK_EQ(parameters_[0]->getSize(), height * factorSize_); latentVectors_ = std::unique_ptr(new Weight(height, factorSize_, parameters_[0])); - v2_ = latentVectors_->getW()->clone(0, 0, useGpu_); + v2_ = Matrix::create(height, factorSize_, false, useGpu_); return true; } From d9062cd9ee1297547c16d57c0d5024ceb3555d2f Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Thu, 26 Oct 2017 00:43:47 +0800 Subject: [PATCH 09/18] Add sparse matrix support in factorization machine layer --- .../layers/FactorizationMachineLayer.cpp | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index e5c9d1a90d5bc..06658a2841382 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -62,7 +62,12 @@ void FactorizationMachineLayer::forward(PassType passType) { outV->sumRows(*tmpOut_, 0.5, 0); x2_ = inputV->clone(0, 0, useGpu_); - inputV->square2(*x2_); + if (dynamic_cast(x2_.get())) { + x2_->copyFrom(*inputV); + (dynamic_cast(x2_.get()))->square2(); + } else { + inputV->square2(*x2_); + } latentVectors_->getW()->square2(*v2_); tmpOut_->mul(*x2_, *v2_); outV->sumRows(*tmpOut_, -0.5, 1.0); @@ -93,11 +98,20 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { /* Calculate the gradients of the latentVectors_ matrix */ if (latentVectors_->getWGrad()) { MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_); - tmpIn->rowScale(0, *inputV, *oGrad); - - latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); + if (dynamic_cast(inputV.get())) { + CpuSparseMatrix* inputV_s = dynamic_cast(inputV.get()); + CpuSparseMatrix* x2_s = dynamic_cast(x2_.get()); + CpuSparseMatrix* tmpIn_s = dynamic_cast(tmpIn.get()); + tmpIn_s->copyFrom(*inputV_s); + tmpIn_s->rowScale(0, *inputV_s, *oGrad); + latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); + tmpIn_s->rowScale(0, *x2_s, *oGrad); + } else { + tmpIn->rowScale(0, *inputV, *oGrad); + latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); + tmpIn->rowScale(0, *x2_, *oGrad); + } - tmpIn->rowScale(0, *x2_, *oGrad); tmpSum->sumCols(*tmpIn, -1, 0); 
latentVectors_->getWGrad()->addRowScale( 0, *latentVectors_->getW(), *tmpSum_T); From 509ae79a5de846dfd38bd85618b2467066413a97 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Thu, 26 Oct 2017 00:47:06 +0800 Subject: [PATCH 10/18] Add rowScale for CpuSparseMatrix --- paddle/math/CpuSparseMatrix.cpp | 17 +++++++++++++++++ paddle/math/CpuSparseMatrix.h | 9 +++++++++ 2 files changed, 26 insertions(+) diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp index bf62229c03bb1..e211c23a7e670 100644 --- a/paddle/math/CpuSparseMatrix.cpp +++ b/paddle/math/CpuSparseMatrix.cpp @@ -260,6 +260,23 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const { os << ";"; } +void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) { + CHECK(getFormat() != SPARSE_CSC) << "Not supported"; + CHECK(height_ == b.getHeight()); + CHECK(width_ == b.getWidth()); + real* A = getValue(); + real* B = b.getValue(); + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK(start == b.getRowStartIdx(i)); + CHECK(end == b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = B[j] * c.getElement(i, cCol); + } + } +} + void CpuSparseMatrix::randomizeUniform() { CHECK_LE(elementCnt_, height_ * width_); if (valueType_ == FLOAT_VALUE) { diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h index 36d57bbb65245..8f9ad67215f5c 100644 --- a/paddle/math/CpuSparseMatrix.h +++ b/paddle/math/CpuSparseMatrix.h @@ -236,6 +236,15 @@ class CpuSparseMatrix : public Matrix { const unsigned int* cols, const real* values); + /** + * @brief this_row = b_row * c_row[cCol] + * + * @param[in] cCol the column of matrix c used to scale each row of b + * @param[in] b CpuSparseMatrix + * @param[in] c Matrix + */ + void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c); + void randomizeUniform(); void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream); From 4172fc09c39b61c3cb1933687680bab15153b59f Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Wed, 1 Nov 2017 21:51:23 +0800 Subject: [PATCH 11/18] Add sparse input support for factorization machine layer --- paddle/gserver/layers/FactorizationMachineLayer.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index 06658a2841382..3bd8d7cb4c7c6 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -104,15 +104,21 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { CpuSparseMatrix* tmpIn_s = dynamic_cast(tmpIn.get()); tmpIn_s->copyFrom(*inputV_s); tmpIn_s->rowScale(0, *inputV_s, *oGrad); - latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); + latentVectors_->getWGrad()->mul(*tmpIn_s->getTranspose(), *tmpMul_, 1, 1); tmpIn_s->rowScale(0, *x2_s, *oGrad); + + MatrixPtr ones = Matrix::create(1, inputV->getHeight(), false, useGpu_); + ones->zeroMem(); + ones->add(-1); + tmpSum->mul(*ones, *tmpIn_s, 1, 0); } else { tmpIn->rowScale(0, *inputV, *oGrad); latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); tmpIn->rowScale(0, *x2_, *oGrad); + + tmpSum->sumCols(*tmpIn, -1, 0); } - tmpSum->sumCols(*tmpIn, -1, 0); latentVectors_->getWGrad()->addRowScale( 0, *latentVectors_->getW(), *tmpSum_T); From 7a1a586355844eb18fb6c87304cee5bbf70d078d Mon Sep 17 00:00:00 2001 From: wangmeng28 
Date: Thu, 16 Nov 2017 17:15:03 +0800 Subject: [PATCH 12/18] Update variable names and docs for factorization machine layer --- .../layers/FactorizationMachineLayer.cpp | 110 +++++++++--------- .../layers/FactorizationMachineLayer.h | 31 +++-- paddle/gserver/tests/test_LayerGrad.cpp | 1 + paddle/math/CpuSparseMatrix.cpp | 8 +- .../paddle/trainer_config_helpers/layers.py | 14 ++- 5 files changed, 94 insertions(+), 70 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index 3bd8d7cb4c7c6..f0f1738f30550 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -32,12 +32,10 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap, /* initialize the latentVectors_ */ CHECK_EQ(inputLayers_.size(), 1UL); - size_t height = inputLayers_[0]->getSize(); - CHECK_EQ(parameters_[0]->getSize(), height * factorSize_); - latentVectors_ = - std::unique_ptr(new Weight(height, factorSize_, parameters_[0])); - - v2_ = Matrix::create(height, factorSize_, false, useGpu_); + size_t inputSize = inputLayers_[0]->getSize(); + CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_); + latentVectors_ = std::unique_ptr( + new Weight(inputSize, factorSize_, parameters_[0])); return true; } @@ -48,79 +46,85 @@ void FactorizationMachineLayer::forward(PassType passType) { const MatrixPtr& inputV = getInputValue(0); size_t batchSize = inputV->getHeight(); - size_t size = getSize(); - reserveOutput(batchSize, size); + size_t outputSize = getSize(); + size_t inputSize = inputLayers_[0]->getSize(); + reserveOutput(batchSize, outputSize); MatrixPtr outV = getOutputValue(); - Matrix::resizeOrCreate(tmpMul_, batchSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate( + latentVectorsSquare_, inputSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate( + inputMulFactor_, batchSize, factorSize_, false, useGpu_); Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_); - REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); - tmpMul_->mul(*inputV, *latentVectors_->getW()); - tmpMul_->square2(*tmpOut_); + REGISTER_TIMER_INFO("InputMulFactorTimer", getName().c_str()); + inputMulFactor_->mul(*inputV, *latentVectors_->getW()); + inputMulFactor_->square2(*tmpOut_); outV->sumRows(*tmpOut_, 0.5, 0); - x2_ = inputV->clone(0, 0, useGpu_); - if (dynamic_cast(x2_.get())) { - x2_->copyFrom(*inputV); - (dynamic_cast(x2_.get()))->square2(); + inputSquare_ = inputV->clone(0, 0, useGpu_); + if (dynamic_cast(inputSquare_.get())) { + inputSquare_->copyFrom(*inputV); + (dynamic_cast(inputSquare_.get()))->square2(); } else { - inputV->square2(*x2_); + inputV->square2(*inputSquare_); } - latentVectors_->getW()->square2(*v2_); - tmpOut_->mul(*x2_, *v2_); + latentVectors_->getW()->square2(*latentVectorsSquare_); + tmpOut_->mul(*inputSquare_, *latentVectorsSquare_); outV->sumRows(*tmpOut_, -0.5, 1.0); /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + REGISTER_TIMER_INFO("FmAtvTimer", getName().c_str()); forwardActivation(); } } void FactorizationMachineLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } + /* Do derivation */ { backwardActivation(); } const MatrixPtr& inputV = getInputValue(0); const MatrixPtr& oGrad = getOutputGrad(); - MatrixPtr tmpSum = - Matrix::create(1, latentVectors_->getW()->getHeight(), false, useGpu_); - 
MatrixPtr tmpSum_T = Matrix::create(tmpSum->getRowBuf(0), - latentVectors_->getW()->getHeight(), - 1, - false, - useGpu_); + Matrix::resizeOrCreate( + tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_); + MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0), + latentVectors_->getW()->getHeight(), + 1, + false, + useGpu_); /* Calculate the gradients of the latentVectors_ matrix */ if (latentVectors_->getWGrad()) { - MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_); + MatrixPtr tmpInput = inputV->clone(0, 0, useGpu_); if (dynamic_cast(inputV.get())) { - CpuSparseMatrix* inputV_s = dynamic_cast(inputV.get()); - CpuSparseMatrix* x2_s = dynamic_cast(x2_.get()); - CpuSparseMatrix* tmpIn_s = dynamic_cast(tmpIn.get()); - tmpIn_s->copyFrom(*inputV_s); - tmpIn_s->rowScale(0, *inputV_s, *oGrad); - latentVectors_->getWGrad()->mul(*tmpIn_s->getTranspose(), *tmpMul_, 1, 1); - tmpIn_s->rowScale(0, *x2_s, *oGrad); - - MatrixPtr ones = Matrix::create(1, inputV->getHeight(), false, useGpu_); - ones->zeroMem(); - ones->add(-1); - tmpSum->mul(*ones, *tmpIn_s, 1, 0); + CpuSparseMatrix* sparseInputV = + dynamic_cast(inputV.get()); + CpuSparseMatrix* sparseInputSquare = + dynamic_cast(inputSquare_.get()); + CpuSparseMatrix* sparseTmpInput = + dynamic_cast(tmpInput.get()); + sparseTmpInput->copyFrom(*sparseInputV); + sparseTmpInput->rowScale(0, *sparseInputV, *oGrad); + latentVectors_->getWGrad()->mul( + *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1); + sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad); + + Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_); + negOnes_->zeroMem(); + negOnes_->add(-1); + tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0); } else { - tmpIn->rowScale(0, *inputV, *oGrad); - latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); - tmpIn->rowScale(0, *x2_, *oGrad); + tmpInput->rowScale(0, *inputV, *oGrad); + latentVectors_->getWGrad()->mul( + *tmpInput->getTranspose(), *inputMulFactor_, 1, 1); + tmpInput->rowScale(0, *inputSquare_, *oGrad); - tmpSum->sumCols(*tmpIn, -1, 0); + tmpSum_->sumCols(*tmpInput, -1, 0); } latentVectors_->getWGrad()->addRowScale( - 0, *latentVectors_->getW(), *tmpSum_T); + 0, *latentVectors_->getW(), *tmpSumTrans); /* Increasing the number of gradient */ latentVectors_->getParameterPtr()->incUpdate(callback); @@ -129,10 +133,10 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { /* Calculate the input layers gradient */ MatrixPtr inGrad = getInputGrad(0); if (inGrad != NULL) { - MatrixPtr latentVectors_T = latentVectors_->getW()->getTranspose(); - inGrad->mul(*tmpMul_, *latentVectors_T, 1, 1); - tmpSum_T->sumRows(*v2_, -1, 0); - inGrad->addColScale(0, *inputV, *tmpSum); + inGrad->mul( + *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1); + tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0); + inGrad->addColScale(0, *inputV, *tmpSum_); inGrad->rowScale(0, *inGrad, *oGrad); } } diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h index 7cf064690ff8d..85d40fdb1ee39 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -34,27 +34,36 @@ namespace paddle { * y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j * \f] * + * The detailed calculation for forward and backward can be found at this paper: + * + * Rendle, Steffen. Factorization machines. IEEE 10th International + * Conference on Data Mining (ICDM). 
IEEE, 2010. + * * The config file api is factorization_machine. */ class FactorizationMachineLayer : public Layer { protected: - /// The latent vectors, shape: (size, factorSize_) - /// Each row of the latentVectors_ matrix is the latent vector - /// corresponding to one input feature dimension + // The latent vectors, shape: (size, factorSize_) + // Each row of the latentVectors_ matrix is the latent vector + // corresponding to one input feature dimension std::unique_ptr latentVectors_; - /// The hyperparameter that defines the dimensionality of the factorization + // The hyperparameter that defines the dimensionality of the factorization size_t factorSize_; private: - /// The result of input matrix * letent vector matrix that will be used in - /// both forward and backward step - MatrixPtr tmpMul_; + // Store the square values of the letent vectors matrix + MatrixPtr latentVectorsSquare_; + // Store the square values of input matrix + MatrixPtr inputSquare_; + // The result of input matrix * latent vector matrix that will be used in + // both forward and backward step + MatrixPtr inputMulFactor_; + // Temporary calculation result store MatrixPtr tmpOut_; - /// Store the square values of the letent vectors matrix - MatrixPtr v2_; - /// Store the square values of input matrix - MatrixPtr x2_; + MatrixPrt tmpSum_; + // Negative identity matrix + MatrixPtr negOnes_; public: explicit FactorizationMachineLayer(const LayerConfig& config) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 072d75c23d64d..04ff618c21427 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2442,6 +2442,7 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) { TEST(Layer, FactorizationMachineLayer) { for (auto useGpu : {false, true}) { testFactorizationMachineLayer(INPUT_DATA, useGpu); + testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, useGpu); } } diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp index e211c23a7e670..6a432cd16b727 100644 --- a/paddle/math/CpuSparseMatrix.cpp +++ b/paddle/math/CpuSparseMatrix.cpp @@ -262,15 +262,15 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const { void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) { CHECK(getFormat() != SPARSE_CSC) << "Not supported"; - CHECK(height_ == b.getHeight()); - CHECK(width_ == b.getWidth()); + CHECK_EQ(height_, b.getHeight()); + CHECK_EQ(width_, b.getWidth()); real* A = getValue(); real* B = b.getValue(); for (size_t i = 0; i < height_; i++) { size_t start = getRowStartIdx(i); size_t end = getRowStartIdx(i + 1); - CHECK(start == b.getRowStartIdx(i)); - CHECK(end == b.getRowStartIdx(i + 1)); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); for (size_t j = start; j < end; j++) { A[j] = B[j] * c.getElement(i, cCol); } diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 30e334e7c8aa5..7e38383bd623c 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -7161,16 +7161,26 @@ def factorization_machine(input, The Factorization Machine models pairwise feature interactions as inner product of the learned latent vectors corresponding to each input feature. The Factorization Machine can effectively capture feature interactions - especially when the input is sparse. 
In practice, usually order 2 feature - interactions are considered using Factorization Machine with the formula: + especially when the input is sparse. + + This implementation only consider the 2-order feature interactions using + Factorization Machine with the formula: + .. math:: y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j + Note: X is the input vector with size n. V is the factor matrix. Each row of V is the latent vector corresponding to each input dimesion. The size of each latent vector is k. + + For details of Factorization Machine, please refer to the paper: + Rendle, Steffen. Factorization machines. IEEE 10th International + Conference on Data Mining (ICDM). IEEE, 2010. + .. code-block:: python factor_machine = factorization_machine(input=input_layer, factor_size=10) + :param input: The input layer. :type input: LayerOutput :param factor_size: The hyperparameter that defines the dimensionality of From 0b6afb589cb74c4cb24b8ee5461f1d8b12674143 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Thu, 16 Nov 2017 19:11:40 +0800 Subject: [PATCH 13/18] Fix typo in factorization machine layer --- paddle/gserver/layers/FactorizationMachineLayer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h index 85d40fdb1ee39..85ac175657c35 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -61,7 +61,7 @@ class FactorizationMachineLayer : public Layer { MatrixPtr inputMulFactor_; // Temporary calculation result store MatrixPtr tmpOut_; - MatrixPrt tmpSum_; + MatrixPtr tmpSum_; // Negative identity matrix MatrixPtr negOnes_; From 09f4f9257981dc3744e9131dabcebebaa5eb7f91 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Thu, 16 Nov 2017 20:33:25 +0800 Subject: [PATCH 14/18] Add unitest for factorization machine layer with sparse input --- paddle/gserver/tests/test_LayerGrad.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 589db0bd6ce17..7ad9866ecff94 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2444,8 +2444,8 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) { TEST(Layer, FactorizationMachineLayer) { for (auto useGpu : {false, true}) { testFactorizationMachineLayer(INPUT_DATA, useGpu); - testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, useGpu); } + testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); } int main(int argc, char** argv) { From d5a6c81dc55057ba437efe417992c0521e87c754 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Mon, 20 Nov 2017 11:48:52 +0800 Subject: [PATCH 15/18] Update docs for factorization machine layer --- paddle/gserver/layers/FactorizationMachineLayer.h | 5 ++--- python/paddle/trainer_config_helpers/layers.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h index 85ac175657c35..3bc36daaab331 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -36,8 +36,7 @@ namespace paddle { * * The detailed calculation for forward and backward can be found at this paper: * - * Rendle, Steffen. Factorization machines. IEEE 10th International - * Conference on Data Mining (ICDM). IEEE, 2010. 
+ * Factorization machines. * * The config file api is factorization_machine. */ @@ -59,7 +58,7 @@ class FactorizationMachineLayer : public Layer { // The result of input matrix * latent vector matrix that will be used in // both forward and backward step MatrixPtr inputMulFactor_; - // Temporary calculation result store + // Store temporary calculation result MatrixPtr tmpOut_; MatrixPtr tmpSum_; // Negative identity matrix diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index cc1bf923dd00c..37214a53d362b 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -3876,7 +3876,7 @@ def recurrent_layer(input, :type input: LayerOutput :param act: Activation type. TanhActivation is the default activation. :type act: BaseActivation - :param bias_attr: The parameter attribute for bias. If this parameter is set to + :param bias_attr: The parameter attribute for bias. If this parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. @@ -7307,8 +7307,7 @@ def factorization_machine(input, each latent vector is k. For details of Factorization Machine, please refer to the paper: - Rendle, Steffen. Factorization machines. IEEE 10th International - Conference on Data Mining (ICDM). IEEE, 2010. + Factorization machines. .. code-block:: python factor_machine = factorization_machine(input=input_layer, factor_size=10) From 6fed6f2079902c86c43161f916c3450094fde6d0 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Mon, 20 Nov 2017 20:44:52 +0800 Subject: [PATCH 16/18] Add support of sparse_binary_vector as input for fm layer --- .../layers/FactorizationMachineLayer.cpp | 20 +++++++++----- .../layers/FactorizationMachineLayer.h | 1 + paddle/math/CpuSparseMatrix.cpp | 26 ++++++++++++++----- 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index f0f1738f30550..b665fb6dfc4a0 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -96,15 +96,20 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { /* Calculate the gradients of the latentVectors_ matrix */ if (latentVectors_->getWGrad()) { - MatrixPtr tmpInput = inputV->clone(0, 0, useGpu_); if (dynamic_cast(inputV.get())) { + Matrix::resizeOrCreateSparseMatrix(tmpInput_, + inputV->getHeight(), + inputV->getWidth(), + inputV->getElementCnt()); + CpuSparseMatrix* sparseInputV = dynamic_cast(inputV.get()); CpuSparseMatrix* sparseInputSquare = dynamic_cast(inputSquare_.get()); CpuSparseMatrix* sparseTmpInput = - dynamic_cast(tmpInput.get()); + dynamic_cast(tmpInput_.get()); sparseTmpInput->copyFrom(*sparseInputV); + sparseTmpInput->rowScale(0, *sparseInputV, *oGrad); latentVectors_->getWGrad()->mul( *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1); @@ -115,12 +120,15 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { negOnes_->add(-1); tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0); } else { - tmpInput->rowScale(0, *inputV, *oGrad); + Matrix::resizeOrCreate( + tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); + + tmpInput_->rowScale(0, *inputV, *oGrad); latentVectors_->getWGrad()->mul( - *tmpInput->getTranspose(), *inputMulFactor_, 1, 1); - tmpInput->rowScale(0, *inputSquare_, *oGrad); 
+ *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1); + tmpInput_->rowScale(0, *inputSquare_, *oGrad); - tmpSum_->sumCols(*tmpInput, -1, 0); + tmpSum_->sumCols(*tmpInput_, -1, 0); } latentVectors_->getWGrad()->addRowScale( diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h index 3bc36daaab331..df20a49934d5d 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -61,6 +61,7 @@ class FactorizationMachineLayer : public Layer { // Store temporary calculation result MatrixPtr tmpOut_; MatrixPtr tmpSum_; + MatrixPtr tmpInput_; // Negative identity matrix MatrixPtr negOnes_; diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp index 6a432cd16b727..dc6979cf5a522 100644 --- a/paddle/math/CpuSparseMatrix.cpp +++ b/paddle/math/CpuSparseMatrix.cpp @@ -266,13 +266,25 @@ void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) { CHECK_EQ(width_, b.getWidth()); real* A = getValue(); real* B = b.getValue(); - for (size_t i = 0; i < height_; i++) { - size_t start = getRowStartIdx(i); - size_t end = getRowStartIdx(i + 1); - CHECK_EQ(start, b.getRowStartIdx(i)); - CHECK_EQ(end, b.getRowStartIdx(i + 1)); - for (size_t j = start; j < end; j++) { - A[j] = B[j] * c.getElement(i, cCol); + if (b.getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = B[j] * c.getElement(i, cCol); + } + } + } else if (b.getValueType() == NO_VALUE) { + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = c.getElement(i, cCol); + } } } } From 74a699a72ef9046a7f302e339c8e20a8152ae9d8 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Mon, 20 Nov 2017 22:14:24 +0800 Subject: [PATCH 17/18] change clone to resizeOrCreate in fm layer --- .../gserver/layers/FactorizationMachineLayer.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index b665fb6dfc4a0..be26b9ba88c27 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -58,16 +58,22 @@ void FactorizationMachineLayer::forward(PassType passType) { inputMulFactor_, batchSize, factorSize_, false, useGpu_); Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_); - REGISTER_TIMER_INFO("InputMulFactorTimer", getName().c_str()); + REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str()); inputMulFactor_->mul(*inputV, *latentVectors_->getW()); inputMulFactor_->square2(*tmpOut_); outV->sumRows(*tmpOut_, 0.5, 0); - inputSquare_ = inputV->clone(0, 0, useGpu_); - if (dynamic_cast(inputSquare_.get())) { + if (dynamic_cast(inputV.get())) { + Matrix::resizeOrCreateSparseMatrix(inputSquare_, + inputV->getHeight(), + inputV->getWidth(), + inputV->getElementCnt(), + inputV->getValueType()); inputSquare_->copyFrom(*inputV); (dynamic_cast(inputSquare_.get()))->square2(); } else { + Matrix::resizeOrCreate( + inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); 
inputV->square2(*inputSquare_); } latentVectors_->getW()->square2(*latentVectorsSquare_); @@ -75,7 +81,7 @@ void FactorizationMachineLayer::forward(PassType passType) { outV->sumRows(*tmpOut_, -0.5, 1.0); /* activation */ { - REGISTER_TIMER_INFO("FmAtvTimer", getName().c_str()); + REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str()); forwardActivation(); } } From 8a283dbc9e78f8c2f00d04180986abfb7d6b29df Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Mon, 27 Nov 2017 19:13:28 +0800 Subject: [PATCH 18/18] Update docs for fm layer --- .../paddle/trainer_config_helpers/layers.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 32287cce6ccae..288aebb5b496d 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -7423,18 +7423,25 @@ def factorization_machine(input, Factorization machines. .. code-block:: python - factor_machine = factorization_machine(input=input_layer, factor_size=10) - - :param input: The input layer. + first_order = paddle.layer.fc(input=input, + size=1, + act=paddle.activation.Linear()) + second_order = paddle.layer.factorization_machine(input=input, + factor_size=10) + fm = paddle.layer.addto(input=[first_order, second_order], + act=paddle.activation.Linear(), + bias_attr=False) + + :param input: The input layer. Supported input types: all input data types + on CPU, and only dense input types on GPU. :type input: LayerOutput :param factor_size: The hyperparameter that defines the dimensionality of - the latent vector size + the latent vector size. :type context_len: int :param act: Activation Type. Default is linear activation. :type act: BaseActivation - :param param_attr: The Parameter Attribute. If None, the latent vectors will - be initialized smartly. It's better to set it by - yourself. + :param param_attr: The parameter attribute. See ParameterAttribute for + details. :type param_attr: ParameterAttribute :param layer_attr: Extra Layer config. :type layer_attr: ExtraLayerAttribute|None
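For reference, the computation implemented by the C++ layer in the patches
above corresponds to the following NumPy sketch (illustrative only: it
assumes dense input and linear activation, omits the in-place gradient
accumulation the real layer performs, and the names fm_forward, fm_backward,
x, v and ograd are mine, not part of the patches):

    import numpy as np

    def fm_forward(x, v):
        """x: (batch, n) input; v: (n, k) latent vectors."""
        xv = x.dot(v)  # corresponds to inputMulFactor_ = X * V
        # 0.5 * sum_f [(X V)^2 - X^2 V^2]: the O(k*n) pairwise-interaction sum
        y = 0.5 * ((xv ** 2).sum(axis=1) - (x ** 2).dot(v ** 2).sum(axis=1))
        return y, xv

    def fm_backward(x, v, xv, ograd):
        """ograd: (batch,) gradient of the loss w.r.t. the layer output."""
        gx = x * ograd[:, None]  # rowScale(0, inputV, oGrad)
        v_grad = gx.T.dot(xv)    # mul(tmpInput^T, inputMulFactor_)
        # addRowScale step: subtract rows of V scaled by sum_b g_b * x_b^2
        v_grad -= ((x ** 2) * ograd[:, None]).sum(axis=0)[:, None] * v
        # input gradient: g * (X V V^T - X .* rowsum(V^2))
        x_grad = (xv.dot(v.T) - x * (v ** 2).sum(axis=1)) * ograd[:, None]
        return v_grad, x_grad

    # Quick shape/consistency check of the two passes:
    rng = np.random.RandomState(0)
    x, v = rng.randn(4, 8), 0.1 * rng.randn(8, 3)
    y, xv = fm_forward(x, v)
    v_grad, x_grad = fm_backward(x, v, xv, np.ones(4))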