Skip to content

Commit

Permalink
Add job=time in trainer, refine cudnn_conv to reduce gpu memory and s…
Browse files Browse the repository at this point in the history
…peed up training. (PaddlePaddle#218)

* Add benchmark for PaddlePaddle, tensorflow and caffe

* ConvProjection to reduce memory for goolenet

* Add unit test for ConvProjection.
1. unit test in test_LayerGrad.
2. compare the ConvPorjection and CudnnConvLayer, also compare the concat_layer+img_conv_layer and concat_layer_conv_projection.

* Reduce cudnn_conv memory and add benchmark document.
1. Use TmpMatrix as the workspace in cudnn_conv to reduce gpu memory. It reduce lots of memory.
2. Add benchmark document.
3. fix smallnet_mnist_cifar.py in paddle.

* Add job=time and refine cudnn_conv to reduce gpu memroy and speed up

* Refine cudnn_conv and shared biases operation in concat_layer and mixed_layer.

* follow comments

* follow comments

* Use unique_ptr to prevent memory leaks in CudnnConvLayer.
  • Loading branch information
qingqing01 authored Nov 2, 2016
1 parent 12945b2 commit 45c81a4
Show file tree
Hide file tree
Showing 39 changed files with 1,340 additions and 396 deletions.
7 changes: 6 additions & 1 deletion doc/ui/cmd_argument/argument_outline.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
</tr>

<tr>
<td class="left" rowspan = "5">GPU</td><td class="left">gpu_id</td>
<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
</tr>

Expand All @@ -207,6 +207,11 @@ It looks like there are a lot of arguments. However, most of them are for develo
<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
</tr>

<tr>
<td class="left">cudnn_conv_workspace_limit_in_mb</td>
<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
</tr>

<tr>
<td class="left" rowspan = "4">RNN</td>
<td class="left">beam_size</td>
Expand Down
4 changes: 4 additions & 0 deletions doc/ui/cmd_argument/detail_introduction.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@
- Choose path to dynamic load NVIDIA CUDA library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH
- type: string (default: "", null)

* `--cudnn_conv_workspace_limit_in_mb`
- Specify cuDNN max workspace limit, in units MB, 4096MB=4GB by default.
- type: int32 (default: 4096MB=4GB)

## NLP: RNN/LSTM/GRU
* `--rnn_use_batch`
- Whether to use batch method for calculation in simple RecurrentLayer.
Expand Down
19 changes: 19 additions & 0 deletions paddle/cuda/include/hl_device_functions.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,24 @@ inline __device__ double paddleAtomicAdd(double* address, double val) {
}
} // namespace paddle

/**
* @brief sum reduction
*
* @param[in,out] smem input data, better to use __shared__ memory.
* @param[in] tid thread index.
* @param[in] threads the total thread number used to reduce,
* such as, blockDim.x.
*
* @return smem[0]: the sum of each elements in smem.
*/
__device__ __forceinline__
void simpleReduce(real* smem, int tid, int threads) {
for (unsigned int s = threads / 2; s > 0; s >>= 1) {
if (tid < s) {
smem[tid] += smem[tid + s];
}
__syncthreads();
}
}

#endif /* HL_DEVICE_FUNCTIONS_CUH_ */
36 changes: 36 additions & 0 deletions paddle/cuda/include/hl_matrix.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,4 +229,40 @@ extern void hl_cossim_derivative(real* grad,
int input2_height,
real scale);

/**
* @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel].
*
* @param[in] A_d input matrix (M x N).
* @param[in] B_d input matrix (1 x channel).
* @param[in] channel width of B.
* @param[in] dimM height of A.
* @param[in] dimN width of A.
* @param[in] scale scalar used for addition.
*
*/
extern void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale);

/**
* @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel].
*
* @param[in] B_d input matrix (1 x channel).
* @param[in] A_d input matrix (M x N).
* @param[in] channel width of B.
* @param[in] dimM height of A.
* @param[in] dimN width of A.
* @param[in] scale scalar used for addition.
*
*/
extern void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale);

#endif /* HL_MATRIX_H_ */
13 changes: 13 additions & 0 deletions paddle/cuda/include/stub/hl_matrix_stub.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,4 +101,17 @@ inline void hl_cossim_derivative(real* grad,
int input2_height,
real scale) {}

inline void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale) {}

inline void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale) {}
#endif // HL_MATRIX_STUB_H_
7 changes: 6 additions & 1 deletion paddle/cuda/src/hl_cuda_cudnn.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ limitations under the License. */
#include "hl_thread.ph"
#include "hl_dso_loader.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"

P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096,
"Specify cuDNN max workspace limit, in units MB, "
"4096MB=4GB by default.");

namespace dynload {

Expand Down Expand Up @@ -242,7 +247,7 @@ void hl_conv_workspace(hl_tensor_descriptor input,
CHECK_NOTNULL(conv);

// Specify workspace limit directly
size_t memoryLimitBytes = 8 * 1024 * 1024;
size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;

// cudnn convolution forward configuration
cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
Expand Down
87 changes: 87 additions & 0 deletions paddle/cuda/src/hl_cuda_matrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ limitations under the License. */
#include "hl_sequence.h"
#include "paddle/utils/Logging.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"

DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
Expand Down Expand Up @@ -673,3 +674,89 @@ void hl_cossim_derivative(real* grad,
input1_height, input2_height, scale);
CHECK_SYNC("hl_cossim_derivate failed");
}

__global__ void KeMatrixAddSharedBias(real* A,
real* B,
const int channel,
const int M,
const int N,
real scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int dim = N / channel;
if (index < M * N) {
int i = index % N;
i = i / dim;
A[index] += scale * B[i];
}
}

void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale) {
const int blocks = 512;
const int grids = DIVUP(dimM * dimN, blocks);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>
(A_d, B_d, channel, dimM, dimN, scale);
CHECK_SYNC("hl_matrix_add_shared_bias failed");
}


template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real *B,
real *A,
const int channel,
const int M,
const int N,
const int dim,
const int limit,
real scale) {
if (dim < limit) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < channel) {
real sum = 0.0;
for (int i = 0; i < M; ++i) {
for (int j = 0; j < dim; ++j) {
sum += A[i * N + index * dim + j];
}
}
B[index] += scale * sum;
}
} else {
const int tid = threadIdx.x;
const int bid = blockIdx.x;
__shared__ real smem[blockSize];
real sum = 0.0;
for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) {
int n = j * blockSize + tid;
int m = n / dim;
int w = n % dim;
smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
__syncthreads();
simpleReduce(smem, tid, blockSize);
sum += smem[0];
}
if (tid == 0) {
B[bid] += scale * sum;
}
}
}

void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale) {
const int dim = dimN / channel;
const int blocks = 256;
const int limit = 64;
int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;

KeMatrixCollectSharedBias<blocks>
<<< grids, blocks, 0, STREAM_DEFAULT>>>
(B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}
18 changes: 0 additions & 18 deletions paddle/cuda/src/hl_cuda_sparse.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -908,24 +908,6 @@ int findIndex(int* indice, int num, int index) {
return (end - 1);
}

/**
* @brief sum reduction
*
* @param[in,out] smem input data, better to use __shared__ memory.
* @param[in] tid local thread index.
* @param[in] blockDimX the size of blockDim.x.
*
* note: return smem[0]: the sum of each elements of smem.
*/
__device__ __forceinline__
void reduce(real* smem, int tid, int blockDimX) {
for (unsigned int s = blockDimX / 2; s > 0; s >>= 1) {
if (tid < s) {
smem[tid] += smem[tid + s];
}
__syncthreads();
}
}

/**
* @brief sum columns of csr sparse matrix (csr_val), then add to a_val.
Expand Down
33 changes: 29 additions & 4 deletions paddle/gserver/layers/ConcatenateLayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ void ConcatenateLayer::backward(const UpdateCallback& callback) {
*/
class ConcatenateLayer2 : public Layer {
public:
explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {}
explicit ConcatenateLayer2(const LayerConfig& config) :
Layer(config) {}

~ConcatenateLayer2() {}

Expand All @@ -110,6 +111,8 @@ class ConcatenateLayer2 : public Layer {
std::vector<std::unique_ptr<Projection>> projections_;
std::vector<Argument> projOutput_;
std::vector<std::pair<size_t, size_t>> projCol_;
bool sharedBias_;
std::unique_ptr<Weight> biases_;
};

REGISTER_LAYER(concat2, ConcatenateLayer2);
Expand All @@ -119,7 +122,6 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap,
/* Initialize the basic parent class */
if (!Layer::init(layerMap, parameterMap)) return false;

CHECK(!biasParameter_);
CHECK_EQ(inputLayers_.size(), parameters_.size());
projections_.reserve(inputLayers_.size());
projCol_.reserve(inputLayers_.size());
Expand All @@ -137,6 +139,13 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap,
}
CHECK_EQ(getSize(), endCol);

/* initialize biases_ */
if (biasParameter_.get() != NULL) {
sharedBias_ = config_.shared_biases();
size_t psize = config_.bias_size();
biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
}

return true;
}

Expand All @@ -154,8 +163,17 @@ void ConcatenateLayer2::forward(PassType passType) {
projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol);
}

for (size_t i = 0; i != inputLayers_.size(); ++i) {
projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
{
AsyncGpuBlock block;
for (size_t i = 0; i != inputLayers_.size(); ++i) {
projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
}
}

/* add the bias-vector */
if (biases_) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
output_.value->addBias(*(biases_->getW()), 1, sharedBias_);
}

/* activation */ {
Expand All @@ -170,6 +188,13 @@ void ConcatenateLayer2::backward(const UpdateCallback& callback) {
backwardActivation();
}

AsyncGpuBlock block;
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str());
biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
biases_->getParameterPtr()->incUpdate(callback);
}

for (size_t i = 0; i != inputLayers_.size(); ++i) {
if (projections_[i]) {
projections_[i]->backward(callback);
Expand Down
51 changes: 34 additions & 17 deletions paddle/gserver/layers/ConvBaseLayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,25 +35,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
filterSizeY_.push_back(conf.filter_size_y());
filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
channels_.push_back(conf.channels());
imgSize_.push_back(conf.img_size());
imgPixels_.push_back(imgSize_.back() * imgSize_.back());
imgSizeH_.push_back(conf.img_size());
imgSizeW_.push_back(conf.img_size());
groups_.push_back(conf.groups());
filterChannels_.push_back(conf.filter_channels());
outputX_.push_back(conf.output_x());
outputs_.push_back(outputX_.back() * outputX_.back());
}

/* initialize the weightList */
CHECK(inputLayers_.size() == parameters_.size());
for (size_t i = 0; i < inputLayers_.size(); i++) {
size_t height, width;
height = filterPixels_[i] * filterChannels_[i];
width = numFilters_;

// create a new weight
CHECK_EQ(parameters_[i]->getSize(), width * height);
Weight* w = new Weight(height, width, parameters_[i]);
weights_.emplace_back(w);
outputH_.push_back(conf.output_x());
outputW_.push_back(conf.output_x());
}

/* initialize the biases_ */
Expand All @@ -74,4 +61,34 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
return true;
}

size_t ConvBaseLayer::calOutputSize() {
auto clearAndReserve = [this](IntV* vec) {
vec->clear();
vec->reserve(this->inputLayers_.size());
};
clearAndReserve(&imgSizeH_);
clearAndReserve(&imgSizeW_);
clearAndReserve(&outputH_);
clearAndReserve(&outputW_);
size_t layerSize = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight());
imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth());
if (imgSizeH_[i] == 0)
imgSizeH_[i] = config_.inputs(i).conv_conf().img_size();
if (imgSizeW_[i] == 0)
imgSizeW_[i] = config_.inputs(i).conv_conf().img_size();
outputH_.push_back(
outputSize(imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i]));
outputW_.push_back(
outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i]));
CHECK_EQ(outputH_[i], outputH_[0]);
CHECK_EQ(outputW_[i], outputW_[0]);
}
getOutput().setFrameHeight(outputH_[0]);
getOutput().setFrameWidth(outputW_[0]);
layerSize = outputH_[0] * outputW_[0] * size_t(numFilters_);
return layerSize;
}

} // namespace paddle
Loading

0 comments on commit 45c81a4

Please sign in to comment.