
Add job=time in trainer, refine cudnn_conv to reduce gpu memory and speed up training. #218

Merged: 10 commits, Nov 2, 2016
7 changes: 6 additions & 1 deletion doc/ui/cmd_argument/argument_outline.md
@@ -183,7 +183,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
</tr>

<tr>
<td class="left" rowspan = "5">GPU</td><td class="left">gpu_id</td>
<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
</tr>

@@ -207,6 +207,11 @@ It looks like there are a lot of arguments. However, most of them are for develo
<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
</tr>

<tr>
<td class="left">cudnn_conv_workspace_limit_in_mb</td>
<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
</tr>

<tr>
<td class="left" rowspan = "4">RNN</td>
<td class="left">beam_size</td>
4 changes: 4 additions & 0 deletions doc/ui/cmd_argument/detail_introduction.md
@@ -163,6 +163,10 @@
- Path from which to dynamically load the NVIDIA CUDA library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH
- type: string (default: "", null)

* `--cudnn_conv_workspace_limit_in_mb`
  - Specify the maximum workspace limit for cuDNN convolutions, in MB. 4096 MB (4 GB) by default.
  - type: int32 (default: 4096)
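  - For example, a hypothetical invocation capping the workspace at 1 GB (all other required flags omitted): `paddle train --use_gpu=true --cudnn_conv_workspace_limit_in_mb=1024`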

## NLP: RNN/LSTM/GRU
* `--rnn_use_batch`
- Whether to use batch method for calculation in simple RecurrentLayer.
19 changes: 19 additions & 0 deletions paddle/cuda/include/hl_device_functions.cuh
@@ -48,5 +48,24 @@ inline __device__ double paddleAtomicAdd(double* address, double val) {
}
} // namespace paddle

/**
 * @brief sum reduction
 *
 * @param[in,out] smem input data; should reside in __shared__ memory.
 * @param[in] tid thread index.
 * @param[in] threads total number of threads participating in the
 *                    reduction, such as blockDim.x; must be a power
 *                    of two, since the loop halves it each step.
 *
 * @return smem[0] holds the sum of all elements in smem.
 */
__device__ __forceinline__
void simpleReduce(real* smem, int tid, int threads) {
for (unsigned int s = threads / 2; s > 0; s >>= 1) {
if (tid < s) {
smem[tid] += smem[tid + s];
}
__syncthreads();
}
}

#endif /* HL_DEVICE_FUNCTIONS_CUH_ */
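A minimal usage sketch for the new `simpleReduce` helper (the kernel below is hypothetical, for illustration only; it assumes Paddle's `real` typedef is in scope and a power-of-two `blockSize`, which the halving loop requires):

```cuda
// Hypothetical row-sum kernel: one block per row, blockSize threads.
template <int blockSize>
__global__ void KeRowSum(real* out, const real* in, int width) {
  __shared__ real smem[blockSize];
  const real* row = in + blockIdx.x * width;

  // Each thread accumulates a strided slice of the row.
  real sum = 0;
  for (int i = threadIdx.x; i < width; i += blockSize) {
    sum += row[i];
  }
  smem[threadIdx.x] = sum;
  __syncthreads();

  simpleReduce(smem, threadIdx.x, blockSize);  // smem[0] = sum of the row
  if (threadIdx.x == 0) {
    out[blockIdx.x] = smem[0];
  }
}
```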
36 changes: 36 additions & 0 deletions paddle/cuda/include/hl_matrix.h
@@ -229,4 +229,40 @@ extern void hl_cossim_derivative(real* grad,
int input2_height,
real scale);

/**
 * @brief Shared-bias addition: A_d[i][j] += scale * B_d[j / (dimN / channel)].
 *
 * @param[in,out] A_d input matrix (M x N).
 * @param[in] B_d bias vector (1 x channel).
 * @param[in] channel width of B.
 * @param[in] dimM height of A.
 * @param[in] dimN width of A.
 * @param[in] scale scalar used for addition.
 *
 */
extern void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale);

/**
 * @brief Shared-bias collection, the reverse of the addition above:
 *        B_d[j / (dimN / channel)] += scale * A_d[i][j], accumulated
 *        over all i and j.
 *
 * @param[in,out] B_d bias vector (1 x channel).
 * @param[in] A_d input matrix (M x N).
 * @param[in] channel width of B.
 * @param[in] dimM height of A.
 * @param[in] dimN width of A.
 * @param[in] scale scalar used for addition.
 *
 */
extern void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale);

#endif /* HL_MATRIX_H_ */
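For clarity, the intended semantics of these two declarations can be written as a plain CPU reference. This is a sketch only; the `*Ref` helpers are hypothetical and not part of the API:

```cpp
// Reference semantics of hl_matrix_add_shared_bias:
// every column j of A gets the bias of its channel, j / (N / channel).
void addSharedBiasRef(real* A, const real* B, int channel, int M, int N,
                      real scale) {
  int dim = N / channel;  // columns per channel
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      A[i * N + j] += scale * B[j / dim];
    }
  }
}

// Reference semantics of hl_matrix_collect_shared_bias:
// each bias accumulates the sum of its channel's columns across all rows.
void collectSharedBiasRef(real* B, const real* A, int channel, int M, int N,
                          real scale) {
  int dim = N / channel;
  for (int k = 0; k < channel; ++k) {
    real sum = 0;
    for (int i = 0; i < M; ++i) {
      for (int j = 0; j < dim; ++j) {
        sum += A[i * N + k * dim + j];
      }
    }
    B[k] += scale * sum;
  }
}
```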
13 changes: 13 additions & 0 deletions paddle/cuda/include/stub/hl_matrix_stub.h
@@ -101,4 +101,17 @@ inline void hl_cossim_derivative(real* grad,
int input2_height,
real scale) {}

inline void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale) {}

inline void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale) {}
#endif // HL_MATRIX_STUB_H_
7 changes: 6 additions & 1 deletion paddle/cuda/src/hl_cuda_cudnn.cc
@@ -20,6 +20,11 @@ limitations under the License. */
#include "hl_thread.ph"
#include "hl_dso_loader.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"

P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096,
               "Specify cuDNN max workspace limit, in MB; "
               "4096 MB (4 GB) by default.");

namespace dynload {

@@ -242,7 +247,7 @@ void hl_conv_workspace(hl_tensor_descriptor input,
CHECK_NOTNULL(conv);

// Specify workspace limit directly
size_t memoryLimitBytes = 8 * 1024 * 1024;
size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;

// cudnn convolution forward configuration
cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
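For context, `memoryLimitBytes` is what the surrounding code passes to cuDNN's algorithm-selection calls. A sketch of that pattern with the cuDNN v4/v5-era API follows; `fwd_src_desc` comes from the hunk above, while the filter/conv/output descriptor names are assumed for illustration and error handling is elided:

```cpp
// Ask cuDNN for the fastest forward algorithm whose workspace fits
// under memoryLimitBytes.
cudnnConvolutionFwdAlgo_t fwdAlgo;
cudnnGetConvolutionForwardAlgorithm(
    handle, fwd_src_desc, filter_desc, conv_desc, fwd_dest_desc,
    CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
    memoryLimitBytes, &fwdAlgo);
```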
87 changes: 87 additions & 0 deletions paddle/cuda/src/hl_cuda_matrix.cu
@@ -20,6 +20,7 @@ limitations under the License. */
#include "hl_sequence.h"
#include "paddle/utils/Logging.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"

DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
@@ -673,3 +674,89 @@ void hl_cossim_derivative(real* grad,
input1_height, input2_height, scale);
CHECK_SYNC("hl_cossim_derivative failed");
}

__global__ void KeMatrixAddSharedBias(real* A,
real* B,
const int channel,
const int M,
const int N,
real scale) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int dim = N / channel;  // columns per channel
  if (index < M * N) {
    int i = index % N;  // column of this element
    i = i / dim;        // channel that the column belongs to
    A[index] += scale * B[i];
  }
}

void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale) {
const int blocks = 512;
const int grids = DIVUP(dimM * dimN, blocks);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>
(A_d, B_d, channel, dimM, dimN, scale);
CHECK_SYNC("hl_matrix_add_shared_bias failed");
}


template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real *B,
real *A,
const int channel,
const int M,
const int N,
const int dim,
const int limit,
real scale) {
  // Few columns per channel: one thread sums an entire channel serially.
  if (dim < limit) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < channel) {
real sum = 0.0;
for (int i = 0; i < M; ++i) {
for (int j = 0; j < dim; ++j) {
sum += A[i * N + index * dim + j];
}
}
B[index] += scale * sum;
}
  } else {
    // Many columns per channel: one block per channel, using a block-wide
    // tree reduction (simpleReduce) over __shared__ memory.
const int tid = threadIdx.x;
const int bid = blockIdx.x;
__shared__ real smem[blockSize];
real sum = 0.0;
for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) {
int n = j * blockSize + tid;
int m = n / dim;
int w = n % dim;
smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
__syncthreads();
simpleReduce(smem, tid, blockSize);
sum += smem[0];
}
if (tid == 0) {
B[bid] += scale * sum;
}
}
}

void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale) {
  const int dim = dimN / channel;
  const int blocks = 256;
  const int limit = 64;
  // Small workloads: one thread per channel; otherwise one block per channel.
  int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;

KeMatrixCollectSharedBias<blocks>
<<< grids, blocks, 0, STREAM_DEFAULT>>>
(B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}
18 changes: 0 additions & 18 deletions paddle/cuda/src/hl_cuda_sparse.cuh
@@ -908,24 +908,6 @@ int findIndex(int* indice, int num, int index) {
return (end - 1);
}

/**
* @brief sum reduction
*
* @param[in,out] smem input data, better to use __shared__ memory.
* @param[in] tid local thread index.
* @param[in] blockDimX the size of blockDim.x.
*
* note: return smem[0]: the sum of each elements of smem.
*/
__device__ __forceinline__
void reduce(real* smem, int tid, int blockDimX) {
for (unsigned int s = blockDimX / 2; s > 0; s >>= 1) {
if (tid < s) {
smem[tid] += smem[tid + s];
}
__syncthreads();
}
}

/**
* @brief sum columns of csr sparse matrix (csr_val), then add to a_val.
33 changes: 29 additions & 4 deletions paddle/gserver/layers/ConcatenateLayer.cpp
@@ -97,7 +97,8 @@ void ConcatenateLayer::backward(const UpdateCallback& callback) {
*/
class ConcatenateLayer2 : public Layer {
public:
explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {}
explicit ConcatenateLayer2(const LayerConfig& config) :
Layer(config) {}

~ConcatenateLayer2() {}

@@ -110,6 +111,8 @@ class ConcatenateLayer2 : public Layer {
std::vector<std::unique_ptr<Projection>> projections_;
std::vector<Argument> projOutput_;
std::vector<std::pair<size_t, size_t>> projCol_;
bool sharedBias_;
std::unique_ptr<Weight> biases_;
};

REGISTER_LAYER(concat2, ConcatenateLayer2);
@@ -119,7 +122,6 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap,
/* Initialize the basic parent class */
if (!Layer::init(layerMap, parameterMap)) return false;

CHECK(!biasParameter_);
CHECK_EQ(inputLayers_.size(), parameters_.size());
projections_.reserve(inputLayers_.size());
projCol_.reserve(inputLayers_.size());
@@ -137,6 +139,13 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap,
}
CHECK_EQ(getSize(), endCol);

/* initialize biases_ */
if (biasParameter_.get() != NULL) {
sharedBias_ = config_.shared_biases();
size_t psize = config_.bias_size();
biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
}

return true;
}

@@ -154,8 +163,17 @@ void ConcatenateLayer2::forward(PassType passType) {
projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol);
}

for (size_t i = 0; i != inputLayers_.size(); ++i) {
projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
  {
    AsyncGpuBlock block;  // keep the projection launches asynchronous in this scope
for (size_t i = 0; i != inputLayers_.size(); ++i) {
projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
}
}

  /* add the bias-vector (broadcast per channel when sharedBias_ is set) */
if (biases_) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
output_.value->addBias(*(biases_->getW()), 1, sharedBias_);
}

/* activation */ {
@@ -170,6 +188,13 @@ void ConcatenateLayer2::backward(const UpdateCallback& callback) {
backwardActivation();
}

AsyncGpuBlock block;
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str());
biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
biases_->getParameterPtr()->incUpdate(callback);
}

for (size_t i = 0; i != inputLayers_.size(); ++i) {
if (projections_[i]) {
projections_[i]->backward(callback);
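Taken together, the changes above give `concat2` the forward semantics (a summary of the code shown, not text from the PR):

$$ y = f\big(\,[\,P_1(x_1), \dots, P_n(x_n)\,] + b\,\big) $$

where the $P_i$ are the per-input projections and the bias $b$ is broadcast per channel when `shared_biases` is set.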
51 changes: 34 additions & 17 deletions paddle/gserver/layers/ConvBaseLayer.cpp
@@ -35,25 +35,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
filterSizeY_.push_back(conf.filter_size_y());
filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
channels_.push_back(conf.channels());
imgSize_.push_back(conf.img_size());
imgPixels_.push_back(imgSize_.back() * imgSize_.back());
imgSizeH_.push_back(conf.img_size());
imgSizeW_.push_back(conf.img_size());
groups_.push_back(conf.groups());
filterChannels_.push_back(conf.filter_channels());
outputX_.push_back(conf.output_x());
outputs_.push_back(outputX_.back() * outputX_.back());
}

/* initialize the weightList */
CHECK(inputLayers_.size() == parameters_.size());
for (size_t i = 0; i < inputLayers_.size(); i++) {
size_t height, width;
height = filterPixels_[i] * filterChannels_[i];
width = numFilters_;

// create a new weight
CHECK_EQ(parameters_[i]->getSize(), width * height);
Weight* w = new Weight(height, width, parameters_[i]);
weights_.emplace_back(w);
outputH_.push_back(conf.output_x());
outputW_.push_back(conf.output_x());
}

/* initialize the biases_ */
@@ -74,4 +61,34 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
return true;
}

size_t ConvBaseLayer::calOutputSize() {
auto clearAndReserve = [this](IntV* vec) {
vec->clear();
vec->reserve(this->inputLayers_.size());
};
clearAndReserve(&imgSizeH_);
clearAndReserve(&imgSizeW_);
clearAndReserve(&outputH_);
clearAndReserve(&outputW_);
size_t layerSize = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight());
imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth());
if (imgSizeH_[i] == 0)
imgSizeH_[i] = config_.inputs(i).conv_conf().img_size();
if (imgSizeW_[i] == 0)
imgSizeW_[i] = config_.inputs(i).conv_conf().img_size();
outputH_.push_back(
outputSize(imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i]));
outputW_.push_back(
outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i]));
CHECK_EQ(outputH_[i], outputH_[0]);
CHECK_EQ(outputW_[i], outputW_[0]);
}
getOutput().setFrameHeight(outputH_[0]);
getOutput().setFrameWidth(outputW_[0]);
layerSize = outputH_[0] * outputW_[0] * size_t(numFilters_);
return layerSize;
}
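`calOutputSize()` defers the per-dimension arithmetic to the `outputSize()` helper, which is not shown in this diff. As a sketch, the usual ceil-mode convolution relation it is expected to implement looks like the following; treat the exact rounding as an assumption:

```cpp
// Assumed output-size relation for one spatial dimension (ceil rounding).
int outputSizeRef(int imageSize, int filterSize, int padding, int stride) {
  return (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1;
}
```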

} // namespace paddle