Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

delete memory copy from linear_chain_crf_op. #7675

Merged
merged 1 commit into from
Jan 19, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions paddle/operators/linear_chain_crf_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
ctx.device_context());
platform::CPUPlace());
}
};

Expand Down Expand Up @@ -248,7 +248,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
framework::ToDataType(
ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))
->type()),
ctx.device_context());
platform::CPUPlace());
}
};

Expand Down
226 changes: 18 additions & 208 deletions paddle/operators/linear_chain_crf_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,57 +65,14 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
const size_t level = 0;
const size_t seq_num = in_lod[level].size() - 1;

// These local variables hold the inputs and outputs, garanteeing them on
// CPU memory, to provide a consistent reference.
// TODO(caoying) Fix this by moving all these local variables into the
// class's data members once we can profile the whole training process.
LoDTensor* emission_weights = nullptr;
LoDTensor emission_weight_tensor;
Tensor* transition_weights = nullptr;
Tensor transition_weight_tensor;
LoDTensor* label = nullptr;
LoDTensor label_tensor;

Tensor* emission_exps = nullptr;
Tensor emission_exps_tensor;
Tensor* transition_exps = nullptr;
Tensor transition_exps_tensor;
Tensor* alpha = nullptr;
Tensor alpha_tensor;
Tensor* ll = nullptr;
Tensor ll_tensor;

if (platform::is_gpu_place(ctx.GetPlace())) {
emission_weights = &emission_weight_tensor;
transition_weights = &transition_weight_tensor;
label = &label_tensor;

CopyInputsToCpuMemory(
ctx.device_context(), *ctx.Input<LoDTensor>("Emission"),
*ctx.Input<Tensor>("Transition"), *ctx.Input<LoDTensor>("Label"),
emission_weights, transition_weights, label);

emission_exps = &emission_exps_tensor;
emission_exps->Resize(emission_weights->dims());

transition_exps = &transition_exps_tensor;
transition_exps->Resize(transition_weights->dims());

alpha = &alpha_tensor;
alpha->Resize(ctx.Output<Tensor>("Alpha")->dims());

ll = &ll_tensor;
} else {
emission_weights =
const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Emission"));
transition_weights = const_cast<Tensor*>(ctx.Input<Tensor>("Transition"));
label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));

emission_exps = ctx.Output<Tensor>("EmissionExps");
transition_exps = ctx.Output<Tensor>("TransitionExps");
alpha = ctx.Output<Tensor>("Alpha");
ll = ctx.Output<Tensor>("LogLikelihood");
}
const LoDTensor* emission_weights = ctx.Input<LoDTensor>("Emission");
const Tensor* transition_weights = ctx.Input<Tensor>("Transition");
const LoDTensor* label = ctx.Input<LoDTensor>("Label");

Tensor* emission_exps = ctx.Output<Tensor>("EmissionExps");
Tensor* transition_exps = ctx.Output<Tensor>("TransitionExps");
Tensor* alpha = ctx.Output<Tensor>("Alpha");
Tensor* ll = ctx.Output<Tensor>("LogLikelihood");

// Because the computation codes only runs on CPU, here the memory for all
// the outputs is FIXED to be allocated on the CPU memory.
Expand Down Expand Up @@ -173,61 +130,9 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
one_seq, one_seq_row_max, one_seq_exps, *transition_weights,
*transition_exps, one_seq_label, &one_seq_alpha);
}

if (platform::is_gpu_place(ctx.GetPlace())) {
CopyOutputsToGpuMemory(
ctx.device_context(), *emission_exps, *transition_exps, *alpha, *ll,
ctx.Output<Tensor>("EmissionExps"),
ctx.Output<Tensor>("TransitionExps"), ctx.Output<Tensor>("Alpha"),
ctx.Output<Tensor>("LogLikelihood"));
}
};

private:
void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
const LoDTensor& emission_weights_src,
const Tensor& transition_weights_src,
const LoDTensor& label_src,
LoDTensor* emission_weights_dst,
Tensor* transition_weights_dst,
LoDTensor* label_dst) const {
// Copy the inputs from GPU memory to CPU memory if this operators runs on
// GPU device.
auto copyLoDTensor = [](const platform::DeviceContext& ctx,
const LoDTensor& src, LoDTensor* dst) {
dst->mutable_data<T>(src.dims(), platform::CPUPlace());
framework::Copy(src, platform::CPUPlace(), ctx, dst);
};

copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
copyLoDTensor(ctx, label_src, label_dst);

transition_weights_dst->mutable_data<T>(transition_weights_src.dims(),
platform::CPUPlace());
framework::Copy(transition_weights_src, platform::CPUPlace(), ctx,
transition_weights_dst);
}

void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
const Tensor& emission_exps_src,
const Tensor& transition_exps_src,
const Tensor& alpha_src, const Tensor& ll_src,
Tensor* emission_exps_dst,
Tensor* transition_exps_dst, Tensor* alpha_dst,
Tensor* ll_dst) const {
// Copy the forward results from CPU memory to GPU memory if this
// operators runs on GPU device.
auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
Tensor* dst) {
dst->mutable_data<T>(platform::CUDAPlace());
framework::Copy(src, platform::CUDAPlace(), ctx, dst);
};
copyTensor(ctx, emission_exps_src, emission_exps_dst);
copyTensor(ctx, transition_exps_src, transition_exps_dst);
copyTensor(ctx, alpha_src, alpha_dst);
copyTensor(ctx, ll_src, ll_dst);
}

T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max,
const Tensor& emission_exps, const Tensor& trans_weights,
const Tensor& trans_weight_exps, const Tensor& label,
Expand Down Expand Up @@ -296,63 +201,17 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
auto lod = ctx.Input<LoDTensor>("Label")->lod();
PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence.");

// These local variables hold the inputs and outputs, garanteeing them on
// CPU memory, to provide a consistent reference.
// TODO(caoying) Fix this by moving all these local variables into the
// class's data members once we can profile the training process, or
// implementing a real GPU kernel for CRF.
Tensor* label = nullptr;
Tensor label_tensor;
Tensor* emission_exps = nullptr;
Tensor emission_exps_tensor;
Tensor* transition_exps = nullptr;
Tensor transition_exps_tensor;
Tensor* alpha = nullptr;
Tensor alpha_tensor;
Tensor ll_grad_tensor;
T* ll_grad = nullptr;

Tensor* emission_grad = nullptr;
Tensor emission_grad_tensor;
Tensor* transition_grad = nullptr;
Tensor transition_grad_tensor;

if (platform::is_gpu_place(ctx.GetPlace())) {
label = &label_tensor;
emission_exps = &emission_exps_tensor;
transition_exps = &transition_exps_tensor;
alpha = &alpha_tensor;
CopyInputsToCpuMemory(
ctx.device_context(), *ctx.Input<LoDTensor>("Label"),
*ctx.Input<Tensor>("EmissionExps"),
*ctx.Input<Tensor>("TransitionExps"), *ctx.Input<Tensor>("Alpha"),
*ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")), label,
emission_exps, transition_exps, alpha, &ll_grad_tensor);
ll_grad = ll_grad_tensor.data<T>();

if (ctx.Output<Tensor>(framework::GradVarName("Emission"))) {
emission_grad = &emission_grad_tensor;
emission_grad->Resize(emission_exps->dims());
}
const Tensor* label = ctx.Input<LoDTensor>("Label");
const Tensor* emission_exps = ctx.Input<Tensor>("EmissionExps");
const Tensor* transition_exps = ctx.Input<Tensor>("TransitionExps");
const Tensor* alpha = ctx.Input<Tensor>("Alpha");
const T* ll_grad =
ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();

if (ctx.Output<Tensor>(framework::GradVarName("Transition"))) {
transition_grad = &transition_grad_tensor;
transition_grad->Resize(transition_exps->dims());
}
} else {
label = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Label"));
emission_exps = const_cast<Tensor*>(ctx.Input<Tensor>("EmissionExps"));
transition_exps =
const_cast<Tensor*>(ctx.Input<Tensor>("TransitionExps"));
alpha = const_cast<Tensor*>(ctx.Input<Tensor>("Alpha"));
ll_grad = const_cast<Tensor*>(
ctx.Input<Tensor>(framework::GradVarName("LogLikelihood")))
->data<T>();

emission_grad = ctx.Output<Tensor>(framework::GradVarName("Emission"));
transition_grad =
ctx.Output<Tensor>(framework::GradVarName("Transition"));
}
Tensor* emission_grad =
ctx.Output<Tensor>(framework::GradVarName("Emission"));
Tensor* transition_grad =
ctx.Output<Tensor>(framework::GradVarName("Transition"));

// TODO(caoying) Fix this constraint. When the Input(Emission) is from the
// data reader operator, it can have no gradients.
Expand Down Expand Up @@ -389,58 +248,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label,
&one_seq_beta, transition_grad, &one_seq_emission_grad);
}

if (platform::is_gpu_place(ctx.GetPlace())) {
CopyOutputsToGpuMemory(
ctx.device_context(), emission_grad, transition_grad,
ctx.Output<Tensor>(framework::GradVarName("Emission")),
ctx.Output<Tensor>(framework::GradVarName("Transition")));
}
};

private:
void CopyInputsToCpuMemory(const platform::DeviceContext& ctx,
const LoDTensor& label_src,
const Tensor& emission_exps_src,
const Tensor& transition_exps_src,
const Tensor& alpha_src, const Tensor& ll_grad_src,
Tensor* label_dst, Tensor* emission_exps_dst,
Tensor* transition_exps_dst, Tensor* alpha_dst,
Tensor* ll_grad_dst) const {
// Copy the inputs from GPU memory to CPU memory when this operators runs on
// GPU device.
label_dst->mutable_data<T>(label_src.dims(), platform::CPUPlace());
framework::Copy(label_src, platform::CPUPlace(), ctx, label_dst);

auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
Tensor* dst) {
dst->mutable_data<T>(src.dims(), platform::CPUPlace());
framework::Copy(src, platform::CPUPlace(), ctx, dst);
};
copyTensor(ctx, emission_exps_src, emission_exps_dst);
copyTensor(ctx, transition_exps_src, transition_exps_dst);
copyTensor(ctx, alpha_src, alpha_dst);
copyTensor(ctx, ll_grad_src, ll_grad_dst);
}

void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
const Tensor* emission_grad_src,
const Tensor* transition_grad_src,
Tensor* emission_grad_dst,
Tensor* transition_grad_dst) const {
// Copy the backward results from CPU memory to GPU
// memory if this operators runs on GPU device.
auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src,
Tensor* dst) {
if (src && dst) {
dst->mutable_data<T>(platform::CUDAPlace());
framework::Copy(*src, platform::CUDAPlace(), ctx, dst);
}
};
copyTensor(ctx, emission_grad_src, emission_grad_dst);
copyTensor(ctx, transition_grad_src, transition_grad_dst);
}

void BackwardOneSequence(const platform::CPUDeviceContext& ctx,
const T ll_grad, const Tensor& emission_exps,
const Tensor& transition_exps, const Tensor& alpha,
Expand Down