Commit

follow comments
qingqing01 committed Oct 31, 2016
1 parent 61e21c3 commit 27e89df
Showing 11 changed files with 61 additions and 69 deletions.
18 changes: 7 additions & 11 deletions paddle/gserver/layers/ConcatenateLayer.cpp
@@ -98,7 +98,7 @@ void ConcatenateLayer::backward(const UpdateCallback& callback) {
class ConcatenateLayer2 : public Layer {
public:
explicit ConcatenateLayer2(const LayerConfig& config) :
Layer(config), sharedBias_(false) {}
Layer(config) {}

~ConcatenateLayer2() {}

@@ -141,9 +141,7 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap,

/* initialize biases_ */
if (biasParameter_.get() != NULL) {
if (config_.has_shared_biases()) {
sharedBias_ = config_.shared_biases();
}
sharedBias_ = config_.shared_biases();
size_t psize = config_.bias_size();
biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
}
@@ -173,7 +171,7 @@ void ConcatenateLayer2::forward(PassType passType) {
}

/* add the bias-vector */
if (biases_.get() != NULL) {
if (biases_) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
output_.value->addBias(*(biases_->getW()), 1, sharedBias_);
}
@@ -190,18 +188,16 @@ void ConcatenateLayer2::backward(const UpdateCallback& callback) {
backwardActivation();
}

AsyncGpuBlock block;
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str());
biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
biases_->getParameterPtr()->incUpdate(callback);
}

{
AsyncGpuBlock block;
for (size_t i = 0; i != inputLayers_.size(); ++i) {
if (projections_[i]) {
projections_[i]->backward(callback);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
if (projections_[i]) {
projections_[i]->backward(callback);
}
}
}
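
The updated backward() above confines the AsyncGpuBlock guard to a nested scope
around the projection backward calls, leaving the bias-gradient update outside
the guarded region. A minimal sketch of the RAII scoping idiom involved, using
a hypothetical ScopedGuard in place of PaddlePaddle's actual AsyncGpuBlock
(illustration only, not part of this commit):

    #include <cstdio>
    #include <functional>
    #include <vector>

    struct ScopedGuard {
      ScopedGuard() { std::printf("enter async region\n"); }   // scope entry
      ~ScopedGuard() { std::printf("leave async region\n"); }  // scope exit
    };

    void backwardSketch(const std::vector<std::function<void()>>& projBackward) {
      // ... bias-gradient work would run here, outside the guarded region ...
      {
        ScopedGuard block;            // guard constructed on entering the block
        for (const auto& fn : projBackward) {
          if (fn) fn();               // projection backward calls
        }
      }                               // guard destroyed here, ending the region
    }
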
12 changes: 8 additions & 4 deletions paddle/gserver/layers/ConvBaseLayer.cpp
@@ -62,10 +62,14 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
}

size_t ConvBaseLayer::calOutputSize() {
imgSizeH_.clear();
imgSizeW_.clear();
outputH_.clear();
outputW_.clear();
auto clearAndReserve = [this](IntV* vec) {
vec->clear();
vec->reserve(this->inputLayers_.size());
};
clearAndReserve(&imgSizeH_);
clearAndReserve(&imgSizeW_);
clearAndReserve(&outputH_);
clearAndReserve(&outputW_);
size_t layerSize = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight());
7 changes: 6 additions & 1 deletion paddle/gserver/layers/ConvBaseLayer.h
@@ -27,7 +27,6 @@ class ConvBaseLayer : public Layer {
protected:
typedef std::vector<int> IntV;


/// The number of filters.
int numFilters_;
/// The x dimension of the padding.
@@ -79,6 +78,12 @@ class ConvBaseLayer : public Layer {
explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}

virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);

/**
 * imgSizeH_ and imgSizeW_ are set in this function according to the previous
 * input layers. It then calculates outputH_ and outputW_ and sets them into
 * the output argument.
*/
virtual size_t calOutputSize();

Weight& getWeight(int idx) { return *weights_[idx]; }
39 changes: 20 additions & 19 deletions paddle/gserver/layers/ConvProjection.cpp
@@ -18,25 +18,11 @@ limitations under the License. */

namespace paddle {

static ThreadLocal<std::vector<MemoryHandle*>> convMem_;
static __thread bool convMemInit = false;
void* getSpaceBytes(size_t size) {
if (!convMemInit) {
int numDevices = hl_get_device_count();
convMem_.get()->resize(numDevices);
convMemInit = true;
}

int devId = hl_get_device();
MemoryHandle** localMem = &(*convMem_.get())[devId];
if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
*localMem = new GpuMemoryHandle(size);
}
return (*localMem)->getBuf();
}

REGISTER_PROJECTION(conv, ConvProjection);

ThreadLocalD<std::vector<MemoryHandle*>> ConvProjection::convMem_;

ConvProjection::ConvProjection(const ProjectionConfig& config,
ParameterPtr parameter, bool useGpu)
: Projection(config, parameter, useGpu) {
@@ -48,8 +34,6 @@ ConvProjection::ConvProjection(const ProjectionConfig& config,
size_t height = filterH_ * filterW_ * channels_ / groups_;
size_t width = numFilters_;
weight_.reset(new Weight(height, width, parameter));


weightOffset_ = height * width / groups_;
}

@@ -108,6 +92,7 @@ void ConvProjection::reshapeTensorDesc(int batchSize) {
// for example, in the case of layer ConcatenateLayer2 with two
// ConvProjection, the stride is the output_size of layer ConcatenateLayer2.
// So the calculation of nStride is different from CudnnConvLayer.
// In fact, only "nStride = out_->value->getStride()" is ok.
size_t nStride = numFilters_ * outputH_ * outputW_;
if (out_->value->isContiguous()) {
CHECK_EQ(nStride, out_->value->getWidth());
@@ -120,7 +105,8 @@ }
}

void ConvProjection::reshape(int batchSize) {
calOutputSize();
size_t width = calOutputSize();
CHECK_EQ(width, out_->value->getWidth());

isSelectAlgo_ = (batchSize == batchNum_);
batchNum_ = batchSize;
@@ -201,6 +187,21 @@ void ConvProjection::backward(const UpdateCallback& callback) {
weight_->getParameterPtr()->incUpdate(callback);
}

void* ConvProjection::getSpaceBytes(size_t size) {
std::vector<MemoryHandle*>& convMem = *convMem_;
if (convMem.empty()) {
int numDevices = hl_get_device_count();
convMem.resize(numDevices);
}

int devId = hl_get_device();
MemoryHandle** localMem = &(convMem[devId]);
if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
*localMem = new GpuMemoryHandle(size);
}
return (*localMem)->getBuf();
}

ConvProjection::~ConvProjection() {
hl_destroy_tensor_descriptor(inputDesc_);
hl_destroy_tensor_descriptor(outputDesc_);
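
getSpaceBytes() above keeps one grow-only workspace buffer per GPU device,
cached in a per-thread vector indexed by device id, and reallocates only when a
larger workspace is requested. A self-contained sketch of that caching pattern
(Buffer, deviceId and deviceCount are plain C++ placeholders for MemoryHandle
and the hl_* device queries; illustration only, not part of this commit):

    #include <memory>
    #include <vector>

    // Placeholder for GpuMemoryHandle: owns a raw byte buffer of a given size.
    struct Buffer {
      explicit Buffer(size_t n) : size(n), data(new char[n]) {}
      size_t size;
      std::unique_ptr<char[]> data;
    };

    // Grow-only workspace cache: one slot per device, one cache per thread,
    // mirroring the shape of ConvProjection::getSpaceBytes.
    void* getWorkspace(int deviceId, int deviceCount, size_t bytes) {
      thread_local std::vector<std::unique_ptr<Buffer>> cache;
      if (cache.empty()) {
        cache.resize(deviceCount);      // lazily size the cache once per thread
      }
      auto& slot = cache[deviceId];
      if (!slot || bytes > slot->size) {
        slot.reset(new Buffer(bytes));  // reallocate only when too small
      }
      return slot->data.get();
    }

Because the cache is per-thread, concurrent workers never contend on it, and
the buffer is reused across passes as long as the requested size does not grow.
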
15 changes: 9 additions & 6 deletions paddle/gserver/layers/ConvProjection.h
@@ -46,7 +46,7 @@ class ConvProjection : public Projection {
return (imageSize - filterSize + 2 * padding) / stride + 1;
}

void calOutputSize() {
size_t calOutputSize() {
imageH_ = in_->getFrameHeight();
imageW_ = in_->getFrameWidth();
if (imageH_ == 0) imageH_ = configImgH_;
@@ -59,8 +59,11 @@

inputOffset_ = (channels_ / groups_) * imageH_ * imageW_;
outputOffset_ = (numFilters_ / groups_) * outputH_ * outputW_;
return outputH_ * outputW_ * numFilters_;
}

static void* getSpaceBytes(size_t size);

/// imageH_ and imageW_ is calculated from the input layer.
int imageH_, imageW_;
/// configImgH_ and configImgW_ is obtained from config.
@@ -87,13 +90,13 @@ class ConvProjection : public Projection {
/// Cudnn tensor descriptor for a convolution operation.
hl_convolution_descriptor convDesc_;

/// Save the algorithm for forward convolution, which is obtained by cudnn
/// Record the algorithm for forward convolution, which is obtained by cudnn
/// api to search the best suited algorithm.
int fwdAlgo_;
/// Save the algorithm for computing convolution gradient with respect to
/// Record the algorithm for computing convolution gradient with respect to
/// filter coefficients.
int bwdFilterAlgo_;
/// Save the algorithm for computing convolution gradient with respect to
/// Record the algorithm for computing convolution gradient with respect to
/// the output.
int bwdDataAlgo_;
/// Amount of GPU memory needed as workspace to be able to execute a
@@ -108,15 +111,15 @@
/// Size of total work space.
size_t workSpaceInBytes_;

/// Is or not select conv algorihtm.
/// Whether to call cuDNN api to choose conv algorithm.
bool isSelectAlgo_;
/// batchNum is used to record batch size. If the batch size is changed,
/// the selection algorithm will be called.
int batchNum_;

bool bias_;

std::unique_ptr<Weight> weight_;
static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
};

} // namespace paddle
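
As a quick check of the output-size formula quoted near the top of this header,
(imageSize - filterSize + 2 * padding) / stride + 1, here is a tiny worked
example with hypothetical geometry (illustration only, not part of this commit):

    #include <cstdio>

    // Same formula as ConvProjection's outputSize() helper.
    static int outputSize(int imageSize, int filterSize, int padding, int stride) {
      return (imageSize - filterSize + 2 * padding) / stride + 1;
    }

    int main() {
      std::printf("%d\n", outputSize(32, 5, 2, 1));  // (32 - 5 + 4) / 1 + 1 = 32
      std::printf("%d\n", outputSize(32, 5, 2, 2));  // (32 - 5 + 4) / 2 + 1 = 16
      return 0;
    }
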
3 changes: 2 additions & 1 deletion paddle/gserver/layers/CudnnConvLayer.cpp
@@ -27,6 +27,7 @@ bool CudnnConvLayer::init(const LayerMap &layerMap,

CHECK_EQ(inputLayers_.size(), parameters_.size());
projections_.reserve(inputLayers_.size());
projConf_.reserve(inputLayers_.size());

numFilters_ = config_.num_filters();
CHECK(config_.shared_biases());
@@ -100,7 +101,7 @@ void CudnnConvLayer::backward(const UpdateCallback &callback) {
}

CudnnConvLayer::~CudnnConvLayer() {
if (biases_.get()) {
if (biases_) {
hl_destroy_tensor_descriptor(biasDesc_);
hl_destroy_tensor_descriptor(outputDesc_);
}
4 changes: 1 addition & 3 deletions paddle/gserver/layers/MixedLayer.cpp
@@ -44,9 +44,7 @@ bool MixedLayer::init(const LayerMap& layerMap,

/* initialize biases_ */
if (biasParameter_.get() != NULL) {
if (config_.has_shared_biases()) {
sharedBias_ = config_.shared_biases();
}
sharedBias_ = config_.shared_biases();
size_t psize = config_.bias_size();
biases_ = std::unique_ptr<Weight>(
new Weight(1, psize, biasParameter_));
9 changes: 2 additions & 7 deletions paddle/trainer/TrainerBenchmark.cpp
@@ -24,19 +24,14 @@ P_DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");

namespace paddle {


void Trainer::time() {
srand(config_->getConfig().start_pass() + 1);
dataProvider_->reset();
startTrain();

this->stats_->reset();
trainerInternal_.getParameterUpdater()->startPass();
evaluator_->start();

trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
DataBatch dataBatch;
int32_t batchSize = config_->getOptConfig().batch_size();

int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch);
CHECK_EQ(num, batchSize) << "The sample number is less than batch size "
<< num << " != " << batchSize;
@@ -70,7 +65,7 @@ void Trainer::time() {
globalStat.printSegTimerStatus();
globalStat.reset();

trainerInternal_.getGradientMachine()->finish();
finishTrain();
}

} // namespace paddle
2 changes: 1 addition & 1 deletion proto/ModelConfig.proto.m4
@@ -255,7 +255,7 @@ sinclude(`ModelConfigLayer.proto.m4')
// (which is how convnets are usually trained). Setting this to
// false will untie the biases, yielding a separate bias for
// every location at which the filter is applied.
optional bool shared_biases = 8;
optional bool shared_biases = 8 [default = false];

// Valid values are ones that divide the area of the output
// grid in this convolutional layer. For example if this layer
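
To make the shared_biases semantics above concrete: with shared biases a
convolutional layer keeps one bias per filter, while untied biases keep one per
filter per output location. A small illustrative calculation with hypothetical
sizes (not taken from this commit):

    #include <cstdio>

    int main() {
      const int numFilters = 32, outputH = 28, outputW = 28;
      std::printf("shared biases : %d\n", numFilters);                      // 32
      std::printf("untied biases : %d\n", numFilters * outputH * outputW);  // 25088
      return 0;
    }
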
20 changes: 4 additions & 16 deletions python/paddle/trainer/config_parser.py
@@ -668,9 +668,6 @@ def calc_bias_size(self):

def calc_parameter_dims(self, input_size, output_size):
return None
# or [self.proj_conf.conv_conf.channels *
# self.proj_conf.conv_conf.filter_size * self.proj_conf.conv_conf.filter_size_y,
# self.config.num_filters]


# Define a operator for mixed layer
@@ -2569,18 +2566,13 @@ def __init__(
record_operator_conf = self.config.operator_confs.add()
record_operator_conf.CopyFrom(operator_conf)

shared_biases=None

psize = self.config.size
if isinstance(self.inputs[0], ConvProjection):
shared_biases = True
self.config.shared_biases = True
psize = 0
for input in self.inputs:
psize += input.calc_bias_size()

if shared_biases is not None:
self.config.shared_biases = shared_biases

self.config.bias_size = psize
self.create_bias_parameter(bias, psize)

@@ -2632,7 +2624,8 @@ def __init__(
for input_index in xrange(len(self.inputs) - 1):
input = self.inputs[input_index + 1]
config_assert(isinstance(input, ConvProjection),
"All the inputs of ConcatenateLayer2 should be ConvProjection.")
"The first input of ConcatenateLayer2 is ConvProjection, "
"the other inputs should also be ConvProjection.")

size = 0
for input_index in xrange(len(self.inputs)):
@@ -2659,18 +2652,13 @@
input.proj_conf.output_size)
self.create_input_parameter(input_index, psize, dims)

shared_biases=None

psize = self.config.size
if isinstance(self.inputs[0], ConvProjection):
shared_biases = True
self.config.shared_biases = True
psize = 0
for input in self.inputs:
psize += input.calc_bias_size()

if shared_biases is not None:
self.config.shared_biases = shared_biases

self.config.bias_size = psize
self.create_bias_parameter(bias, psize)

1 change: 1 addition & 0 deletions python/paddle/trainer_config_helpers/networks.py
@@ -895,6 +895,7 @@ def simple_gru2(input,
"""
simple_gru2 is the same with simple_gru, but using grumemory instead
Please see grumemory in layers.py for more detail about the maths.
simple_gru2 is faster than simple_gru.
The example usage is:
