From 4020451a7b6a42e3a86bafeb683b2c20297560ab Mon Sep 17 00:00:00 2001 From: caoying03 Date: Fri, 19 Jan 2018 10:12:11 +0800 Subject: [PATCH] delete memory copy from linear_chain_crf_op. --- paddle/operators/linear_chain_crf_op.cc | 4 +- paddle/operators/linear_chain_crf_op.h | 226 ++---------------------- 2 files changed, 20 insertions(+), 210 deletions(-) diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc index 975e394c78db0..e24bf622b7f11 100644 --- a/paddle/operators/linear_chain_crf_op.cc +++ b/paddle/operators/linear_chain_crf_op.cc @@ -187,7 +187,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Emission")->type()), - ctx.device_context()); + platform::CPUPlace()); } }; @@ -248,7 +248,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { framework::ToDataType( ctx.Input(framework::GradVarName("LogLikelihood")) ->type()), - ctx.device_context()); + platform::CPUPlace()); } }; diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index f502ebefde1fb..afc197a1c3809 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -65,57 +65,14 @@ class LinearChainCRFOpKernel : public framework::OpKernel { const size_t level = 0; const size_t seq_num = in_lod[level].size() - 1; - // These local variables hold the inputs and outputs, garanteeing them on - // CPU memory, to provide a consistent reference. - // TODO(caoying) Fix this by moving all these local variables into the - // class's data members once we can profile the whole training process. - LoDTensor* emission_weights = nullptr; - LoDTensor emission_weight_tensor; - Tensor* transition_weights = nullptr; - Tensor transition_weight_tensor; - LoDTensor* label = nullptr; - LoDTensor label_tensor; - - Tensor* emission_exps = nullptr; - Tensor emission_exps_tensor; - Tensor* transition_exps = nullptr; - Tensor transition_exps_tensor; - Tensor* alpha = nullptr; - Tensor alpha_tensor; - Tensor* ll = nullptr; - Tensor ll_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - emission_weights = &emission_weight_tensor; - transition_weights = &transition_weight_tensor; - label = &label_tensor; - - CopyInputsToCpuMemory( - ctx.device_context(), *ctx.Input("Emission"), - *ctx.Input("Transition"), *ctx.Input("Label"), - emission_weights, transition_weights, label); - - emission_exps = &emission_exps_tensor; - emission_exps->Resize(emission_weights->dims()); - - transition_exps = &transition_exps_tensor; - transition_exps->Resize(transition_weights->dims()); - - alpha = &alpha_tensor; - alpha->Resize(ctx.Output("Alpha")->dims()); - - ll = &ll_tensor; - } else { - emission_weights = - const_cast(ctx.Input("Emission")); - transition_weights = const_cast(ctx.Input("Transition")); - label = const_cast(ctx.Input("Label")); - - emission_exps = ctx.Output("EmissionExps"); - transition_exps = ctx.Output("TransitionExps"); - alpha = ctx.Output("Alpha"); - ll = ctx.Output("LogLikelihood"); - } + const LoDTensor* emission_weights = ctx.Input("Emission"); + const Tensor* transition_weights = ctx.Input("Transition"); + const LoDTensor* label = ctx.Input("Label"); + + Tensor* emission_exps = ctx.Output("EmissionExps"); + Tensor* transition_exps = ctx.Output("TransitionExps"); + Tensor* alpha = ctx.Output("Alpha"); + Tensor* ll = ctx.Output("LogLikelihood"); // Because the computation codes only runs on CPU, here the memory for all // the outputs is FIXED to be allocated on the CPU memory. @@ -173,61 +130,9 @@ class LinearChainCRFOpKernel : public framework::OpKernel { one_seq, one_seq_row_max, one_seq_exps, *transition_weights, *transition_exps, one_seq_label, &one_seq_alpha); } - - if (platform::is_gpu_place(ctx.GetPlace())) { - CopyOutputsToGpuMemory( - ctx.device_context(), *emission_exps, *transition_exps, *alpha, *ll, - ctx.Output("EmissionExps"), - ctx.Output("TransitionExps"), ctx.Output("Alpha"), - ctx.Output("LogLikelihood")); - } }; private: - void CopyInputsToCpuMemory(const platform::DeviceContext& ctx, - const LoDTensor& emission_weights_src, - const Tensor& transition_weights_src, - const LoDTensor& label_src, - LoDTensor* emission_weights_dst, - Tensor* transition_weights_dst, - LoDTensor* label_dst) const { - // Copy the inputs from GPU memory to CPU memory if this operators runs on - // GPU device. - auto copyLoDTensor = [](const platform::DeviceContext& ctx, - const LoDTensor& src, LoDTensor* dst) { - dst->mutable_data(src.dims(), platform::CPUPlace()); - framework::Copy(src, platform::CPUPlace(), ctx, dst); - }; - - copyLoDTensor(ctx, emission_weights_src, emission_weights_dst); - copyLoDTensor(ctx, label_src, label_dst); - - transition_weights_dst->mutable_data(transition_weights_src.dims(), - platform::CPUPlace()); - framework::Copy(transition_weights_src, platform::CPUPlace(), ctx, - transition_weights_dst); - } - - void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx, - const Tensor& emission_exps_src, - const Tensor& transition_exps_src, - const Tensor& alpha_src, const Tensor& ll_src, - Tensor* emission_exps_dst, - Tensor* transition_exps_dst, Tensor* alpha_dst, - Tensor* ll_dst) const { - // Copy the forward results from CPU memory to GPU memory if this - // operators runs on GPU device. - auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, - Tensor* dst) { - dst->mutable_data(platform::CUDAPlace()); - framework::Copy(src, platform::CUDAPlace(), ctx, dst); - }; - copyTensor(ctx, emission_exps_src, emission_exps_dst); - copyTensor(ctx, transition_exps_src, transition_exps_dst); - copyTensor(ctx, alpha_src, alpha_dst); - copyTensor(ctx, ll_src, ll_dst); - } - T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max, const Tensor& emission_exps, const Tensor& trans_weights, const Tensor& trans_weight_exps, const Tensor& label, @@ -296,63 +201,17 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { auto lod = ctx.Input("Label")->lod(); PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence."); - // These local variables hold the inputs and outputs, garanteeing them on - // CPU memory, to provide a consistent reference. - // TODO(caoying) Fix this by moving all these local variables into the - // class's data members once we can profile the training process, or - // implementing a real GPU kernel for CRF. - Tensor* label = nullptr; - Tensor label_tensor; - Tensor* emission_exps = nullptr; - Tensor emission_exps_tensor; - Tensor* transition_exps = nullptr; - Tensor transition_exps_tensor; - Tensor* alpha = nullptr; - Tensor alpha_tensor; - Tensor ll_grad_tensor; - T* ll_grad = nullptr; - - Tensor* emission_grad = nullptr; - Tensor emission_grad_tensor; - Tensor* transition_grad = nullptr; - Tensor transition_grad_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - label = &label_tensor; - emission_exps = &emission_exps_tensor; - transition_exps = &transition_exps_tensor; - alpha = &alpha_tensor; - CopyInputsToCpuMemory( - ctx.device_context(), *ctx.Input("Label"), - *ctx.Input("EmissionExps"), - *ctx.Input("TransitionExps"), *ctx.Input("Alpha"), - *ctx.Input(framework::GradVarName("LogLikelihood")), label, - emission_exps, transition_exps, alpha, &ll_grad_tensor); - ll_grad = ll_grad_tensor.data(); - - if (ctx.Output(framework::GradVarName("Emission"))) { - emission_grad = &emission_grad_tensor; - emission_grad->Resize(emission_exps->dims()); - } + const Tensor* label = ctx.Input("Label"); + const Tensor* emission_exps = ctx.Input("EmissionExps"); + const Tensor* transition_exps = ctx.Input("TransitionExps"); + const Tensor* alpha = ctx.Input("Alpha"); + const T* ll_grad = + ctx.Input(framework::GradVarName("LogLikelihood"))->data(); - if (ctx.Output(framework::GradVarName("Transition"))) { - transition_grad = &transition_grad_tensor; - transition_grad->Resize(transition_exps->dims()); - } - } else { - label = const_cast(ctx.Input("Label")); - emission_exps = const_cast(ctx.Input("EmissionExps")); - transition_exps = - const_cast(ctx.Input("TransitionExps")); - alpha = const_cast(ctx.Input("Alpha")); - ll_grad = const_cast( - ctx.Input(framework::GradVarName("LogLikelihood"))) - ->data(); - - emission_grad = ctx.Output(framework::GradVarName("Emission")); - transition_grad = - ctx.Output(framework::GradVarName("Transition")); - } + Tensor* emission_grad = + ctx.Output(framework::GradVarName("Emission")); + Tensor* transition_grad = + ctx.Output(framework::GradVarName("Transition")); // TODO(caoying) Fix this constraint. When the Input(Emission) is from the // data reader operator, it can have no gradients. @@ -389,58 +248,9 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label, &one_seq_beta, transition_grad, &one_seq_emission_grad); } - - if (platform::is_gpu_place(ctx.GetPlace())) { - CopyOutputsToGpuMemory( - ctx.device_context(), emission_grad, transition_grad, - ctx.Output(framework::GradVarName("Emission")), - ctx.Output(framework::GradVarName("Transition"))); - } }; private: - void CopyInputsToCpuMemory(const platform::DeviceContext& ctx, - const LoDTensor& label_src, - const Tensor& emission_exps_src, - const Tensor& transition_exps_src, - const Tensor& alpha_src, const Tensor& ll_grad_src, - Tensor* label_dst, Tensor* emission_exps_dst, - Tensor* transition_exps_dst, Tensor* alpha_dst, - Tensor* ll_grad_dst) const { - // Copy the inputs from GPU memory to CPU memory when this operators runs on - // GPU device. - label_dst->mutable_data(label_src.dims(), platform::CPUPlace()); - framework::Copy(label_src, platform::CPUPlace(), ctx, label_dst); - - auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, - Tensor* dst) { - dst->mutable_data(src.dims(), platform::CPUPlace()); - framework::Copy(src, platform::CPUPlace(), ctx, dst); - }; - copyTensor(ctx, emission_exps_src, emission_exps_dst); - copyTensor(ctx, transition_exps_src, transition_exps_dst); - copyTensor(ctx, alpha_src, alpha_dst); - copyTensor(ctx, ll_grad_src, ll_grad_dst); - } - - void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx, - const Tensor* emission_grad_src, - const Tensor* transition_grad_src, - Tensor* emission_grad_dst, - Tensor* transition_grad_dst) const { - // Copy the backward results from CPU memory to GPU - // memory if this operators runs on GPU device. - auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src, - Tensor* dst) { - if (src && dst) { - dst->mutable_data(platform::CUDAPlace()); - framework::Copy(*src, platform::CUDAPlace(), ctx, dst); - } - }; - copyTensor(ctx, emission_grad_src, emission_grad_dst); - copyTensor(ctx, transition_grad_src, transition_grad_dst); - } - void BackwardOneSequence(const platform::CPUDeviceContext& ctx, const T ll_grad, const Tensor& emission_exps, const Tensor& transition_exps, const Tensor& alpha,