PaddlePaddle · will-am · Nov 27, 2017 · Oct 11, 2017 · Oct 11, 2017 · Oct 11, 2017
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
@@ -54,7 +54,7 @@ img_conv
 
 ..  _api_v2.layer_context_projection:
 
-context_projection 
+context_projection
 ------------------
 ..  autoclass:: paddle.v2.layer.context_projection
     :noindex:
@@ -70,7 +70,7 @@ Image Pooling Layer
 img_pool
 --------
 ..  autoclass:: paddle.v2.layer.img_pool
-    :noindex:   
+    :noindex:
 
 spp
 ---
@@ -104,7 +104,7 @@ sum_to_one_norm
 ---------------
 ..  autoclass:: paddle.v2.layer.sum_to_one_norm
     :noindex:
-    
+
 cross_channel_norm
 ------------------
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
@@ -114,7 +114,7 @@ row_l2_norm
 -----------
 ..  autoclass:: paddle.v2.layer.row_l2_norm
     :noindex:
-    
+
 Recurrent Layers
 ================
 
@@ -400,6 +400,13 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
     :noindex:
 
+Factorization Machine Layer
+============================
+
+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
 
 Slicing and Joining Layers
 ==========================

diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -0,0 +1,140 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FactorizationMachineLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
+
+bool FactorizationMachineLayer::init(const LayerMap& layerMap,
+                                     const ParameterMap& parameterMap) {
+  // The base class has to be initialized before anything layer specific.
+  Layer::init(layerMap, parameterMap);
+
+  // Length of each latent vector, as given by the layer configuration.
+  factorSize_ = config_.factor_size();
+
+  // Exactly one input is supported. Its parameter must hold one latent
+  // vector (factorSize_ values) for every input dimension.
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  const size_t inputDim = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), inputDim * factorSize_);
+  latentVectors_.reset(new Weight(inputDim, factorSize_, parameters_[0]));
+
+  // Same shape as the latent vectors; forward() stores their element-wise
+  // square here.
+  v2_ = Matrix::create(inputDim, factorSize_, false, useGpu_);
+
+  return true;
+}
+
+void FactorizationMachineLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const MatrixPtr& inValue = getInputValue(0);
+
+  // The output has one row per row of the input.
+  const size_t numSamples = inValue->getHeight();
+  reserveOutput(numSamples, getSize());
+  MatrixPtr outValue = getOutputValue();
+
+  Matrix::resizeOrCreate(tmpMul_, numSamples, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(tmpOut_, numSamples, factorSize_, false, useGpu_);
+
+  REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
+
+  // First term: 0.5 * row sums of (x * V)^2.
+  // tmpMul_ = x * V is kept because backward() reads it again.
+  tmpMul_->mul(*inValue, *latentVectors_->getW());
+  tmpMul_->square2(*tmpOut_);
+  outValue->sumRows(*tmpOut_, 0.5, 0);
+
+  // x2_ = element-wise square of the input. A CPU sparse input is copied and
+  // squared in place, any other input is squared directly into x2_.
+  x2_ = inValue->clone(0, 0, useGpu_);
+  CpuSparseMatrix* sparseX2 = dynamic_cast<CpuSparseMatrix*>(x2_.get());
+  if (sparseX2 != nullptr) {
+    x2_->copyFrom(*inValue);
+    sparseX2->square2();
+  } else {
+    inValue->square2(*x2_);
+  }
+
+  // Second term: subtract 0.5 * row sums of (x^2 * V^2) from the first term.
+  latentVectors_->getW()->square2(*v2_);
+  tmpOut_->mul(*x2_, *v2_);
+  outValue->sumRows(*tmpOut_, -0.5, 1.0);
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+/**
+ * Back-propagates through the activation, then accumulates the gradient of
+ * the latent vectors (when they have a gradient buffer) and the gradient of
+ * the input (when the input has one).
+ *
+ * Relies on tmpMul_ (x * V), x2_ (x^2) and v2_ (V^2) computed by the
+ * preceding forward() call.
+ *
+ * @param callback passed to incUpdate() once the latent vector gradient has
+ *                 been accumulated.
+ */
+void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  const MatrixPtr& inputV = getInputValue(0);
+  const MatrixPtr& oGrad = getOutputGrad();
+
+  /* tmpSum (1 x n) and tmpSum_T (n x 1) are created over the same buffer, so
+   * a value written through one of them is read back through the other.
+   * n is the number of rows of the latent vectors matrix. */
+  MatrixPtr tmpSum =
+      Matrix::create(1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSum_T = Matrix::create(tmpSum->getRowBuf(0),
+                                      latentVectors_->getW()->getHeight(),
+                                      1,
+                                      false,
+                                      useGpu_);
+
+  /* Calculate the gradients of the latentVectors_ matrix:
+   *   dV += (oGrad-scaled x)^T * (x * V)  -  V row-scaled by sum(oGrad * x^2)
+   */
+  if (latentVectors_->getWGrad()) {
+    MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_);
+    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+      CpuSparseMatrix* inputV_s = dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* x2_s = dynamic_cast<CpuSparseMatrix*>(x2_.get());
+      CpuSparseMatrix* tmpIn_s = dynamic_cast<CpuSparseMatrix*>(tmpIn.get());
+      tmpIn_s->copyFrom(*inputV_s);
+      tmpIn_s->rowScale(0, *inputV_s, *oGrad);
+      latentVectors_->getWGrad()->mul(*tmpIn_s->getTranspose(), *tmpMul_, 1, 1);
+      tmpIn_s->rowScale(0, *x2_s, *oGrad);
+
+      /* Sparse path: the negated column sums are obtained by multiplying
+       * with a row vector filled with -1. */
+      MatrixPtr ones = Matrix::create(1, inputV->getHeight(), false, useGpu_);
+      ones->zeroMem();
+      ones->add(-1);
+      tmpSum->mul(*ones, *tmpIn_s, 1, 0);
+    } else {
+      tmpIn->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1);
+      tmpIn->rowScale(0, *x2_, *oGrad);
+
+      tmpSum->sumCols(*tmpIn, -1, 0);
+    }
+
+    /* tmpSum_T is the (n x 1) view of the sums written to tmpSum above */
+    latentVectors_->getWGrad()->addRowScale(
+        0, *latentVectors_->getW(), *tmpSum_T);
+
+    /* Increasing the number of gradient */
+    latentVectors_->getParameterPtr()->incUpdate(callback);
+  }
+
+  /* Calculate the input layers gradient:
+   *   dx += oGrad * ((x * V) * V^T  -  x col-scaled by row sums of V^2)
+   */
+  MatrixPtr inGrad = getInputGrad(0);
+  if (inGrad != nullptr) {
+    /* Build this layer's contribution in a scratch matrix. Scaling inGrad
+     * itself by oGrad would also rescale whatever gradient is already
+     * accumulated in it. */
+    MatrixPtr localGrad = Matrix::create(
+        inGrad->getHeight(), inGrad->getWidth(), false, useGpu_);
+    localGrad->zeroMem();
+
+    MatrixPtr latentVectors_T = latentVectors_->getW()->getTranspose();
+    localGrad->mul(*tmpMul_, *latentVectors_T, 1, 1);
+    /* written through the (n x 1) view, read below through the (1 x n) view */
+    tmpSum_T->sumRows(*v2_, -1, 0);
+    localGrad->addColScale(0, *inputV, *tmpSum);
+    localGrad->rowScale(0, *localGrad, *oGrad);
+
+    inGrad->add(*localGrad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief The Factorization Machine models pairwise (order-2) feature
+ * interactions as inner product of the learned latent vectors corresponding
+ * to each input feature.
+ *
+ * The Factorization Machine can effectively capture feature interactions
+ * especially when the input is sparse. While in principle FM can model higher
+ * order feature interaction, in practice usually only order-2 feature
+ * interactions are considered. The Factorization Machine Layer here only
+ * computes the order-2 interactions with the formula:
+ *
+ * \f[
+ *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The config file api is factorization_machine.
+ */
+
+class FactorizationMachineLayer : public Layer {
+protected:
+  /// The latent vectors, shape: (input layer size, factorSize_)
+  /// Each row of the latentVectors_ matrix is the latent vector
+  /// corresponding to one input feature dimension
+  std::unique_ptr<Weight> latentVectors_;
+  /// The hyperparameter that defines the dimensionality of the factorization
+  size_t factorSize_;
+
+private:
+  /// The result of input matrix * latent vector matrix that will be used in
+  /// both forward and backward step
+  MatrixPtr tmpMul_;
+  MatrixPtr tmpOut_;  ///< Scratch buffer used by forward()
+  /// Store the square values of the latent vectors matrix
+  MatrixPtr v2_;
+  /// Store the square values of input matrix
+  MatrixPtr x2_;
+
+public:
+  explicit FactorizationMachineLayer(const LayerConfig& config)
+      : Layer(config) {}
+  ~FactorizationMachineLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2427,6 +2427,24 @@ TEST(Layer, ScaleSubRegionLayer) {
   }
 }
 
+// Runs the layer gradient check for a factorization_machine layer with the
+// given input type on either the CPU or the GPU.
+void testFactorizationMachineLayer(InputType type, bool useGpu) {
+  const int kFactorSize = 10;
+
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("factorization_machine");
+  config.layerConfig.set_size(1);
+  config.layerConfig.set_factor_size(kFactorSize);
+
+  // NOTE(review): 1280 looks like the parameter size, i.e. 128 input
+  // dimensions times the factor size of 10 -- confirm against InputDef.
+  config.inputDefs.push_back({type, "layer_0", 128, 1280});
+  config.layerConfig.add_inputs();
+
+  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
+}
+
+TEST(Layer, FactorizationMachineLayer) {
+  // Run the check once with useGpu == false and once with useGpu == true.
+  const bool gpuFlags[] = {false, true};
+  for (const bool useGpu : gpuFlags) {
+    testFactorizationMachineLayer(INPUT_DATA, useGpu);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);

diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
@@ -540,6 +540,9 @@ message LayerConfig {
 
   // for switch order layer
   optional ReshapeConfig reshape_conf = 59;
+
+  // for factorization machine layer
+  optional uint32 factor_size = 60;
 }
 
 message EvaluatorConfig {

diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
@@ -3832,6 +3832,21 @@ def __init__(self, name, inputs, value, **xargs):
                            image_conf.channels)
 
 
+@config_layer('factorization_machine')
+class FactorizationMachineLayer(LayerBase):
+    def __init__(self, name, inputs, factor_size, **xargs):
+        super(FactorizationMachineLayer, self).__init__(
+            name, 'factorization_machine', size=1, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'factorization machine layer must have one and only one input.')
+        self.config.factor_size = factor_size
+        input_layer = self.get_input_layer(0)
+        psize = input_layer.size * factor_size
+        dims = [input_layer.size, factor_size]
+        self.create_input_parameter(0, psize, dims)
+
+
 # Deprecated, use a new layer specific class instead
 @config_func
 def Layer(name, type, **xargs):

diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
@@ -146,6 +146,7 @@
     'resize_layer',
     'sub_seq_layer',
     'scale_sub_region_layer',
+    'factorization_machine',
 ]
 
 
@@ -260,6 +261,8 @@ class LayerType(object):
 
     SCALE_SUB_REGION_LAYER = 'scale_sub_region'
 
+    FACTORIZATION_MACHINE = 'factorization_machine'
+
     @staticmethod
     def is_layer_type(type_name):
         """
@@ -7142,3 +7145,57 @@ def scale_sub_region_layer(input, indices, value, name=None):
         parents=[input, indices],
         num_filters=input.num_filters,
         size=input.size)
+
+
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support()
+def factorization_machine(input,
+                          factor_size,
+                          act=None,
+                          name=None,
+                          param_attr=None,
+                          layer_attr=None):
+    """
+    The Factorization Machine models pairwise feature interactions as inner
+    product of the learned latent vectors corresponding to each input feature.
+    The Factorization Machine can effectively capture feature interactions
+    especially when the input is sparse. In practice, usually order 2 feature
+    interactions are considered using Factorization Machine with the formula:
+    .. math::
+        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+    Note:
+        X is the input vector with size n. V is the factor matrix. Each row of V
+        is the latent vector corresponding to each input dimesion. The size of
+        each latent vector is k.
+    .. code-block:: python
+       factor_machine = factorization_machine(input=input_layer, factor_size=10)
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param factor_size: The hyperparameter that defines the dimensionality of
+                        the latent vector size
+    :type context_len: int
+    :param act: Activation Type. Default is linear activation.
+    :type act: BaseActivation
+    :param param_attr: The Parameter Attribute. If None, the latent vectors will
+                       be initialized smartly. It's better to set it by
+                       yourself.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert factor_size > 0, "the factor_size must be greater than 0."
+
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        factor_size=factor_size,
+        type=LayerType.FACTORIZATION_MACHINE,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1)