From 11002430de2efb496b348d2443610cbcf104a4cf Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Thu, 22 Sep 2022 11:32:21 +0800 Subject: [PATCH] [Dygraph] Fix bugs of mp in eager mode (#46303) * fix bugs of mp * fix bugs of mp * update * update * fix bug --- .../c_softmax_with_cross_entropy_op.cu | 4 +++- .../fleet/utils/hybrid_parallel_util.py | 21 +++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index ef7e298aaf6a3..f1640d2f4a3f5 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -265,7 +265,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { auto map = distributed::ProcessGroupMapFromGid::getInstance(); distributed::ProcessGroup* pg = map->get(rid); distributed::AllreduceOptions opts; - opts.reduce_op = distributed::ReduceOp::SUM; + opts.reduce_op = distributed::ReduceOp::MAX; // allocate memory on device. softmax->mutable_data(place); @@ -348,6 +348,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { in_out.clear(); in_out.push_back(predicted_logits); + opts.reduce_op = distributed::ReduceOp::SUM; pg->AllReduce(in_out, in_out, opts)->Synchronize(); // step 4, obtain exp(logit) @@ -364,6 +365,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { in_out.clear(); in_out.push_back(sum_exp_logits); + opts.reduce_op = distributed::ReduceOp::SUM; pg->AllReduce(in_out, in_out, opts)->Synchronize(); auto eigen_loss = math::EigenMatrix::From(loss_2d); diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index 3246866f225ce..e7bd434b94fd3 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -106,13 +106,26 @@ def _broadcast_data_help(data, shape, dtype, hcg): group=model_parallel_group, sync_op=True) + if mp_rank != 0: + if in_dygraph_mode(): + data._clear_data() + input_data._share_buffer_to(data) + else: + data.value().get_tensor()._clear() + data.value().get_tensor()._share_data_with( + input_data.value().get_tensor()) + def broadcast_input_data(hcg, *inputs, **kwargs): cur_device = paddle.get_device() for v in inputs: if isinstance(v, (core.VarBase, core.eager.Tensor)): with framework.no_grad(): - v = v.cuda() if "gpu" in cur_device else v + if "gpu" in cur_device and in_dygraph_mode() \ + and not v.place.is_gpu_place(): + v_gpu = v.cuda(int(cur_device.split(":")[1])) + v._clear_data() + v_gpu._share_buffer_to(v) _broadcast_data_help(v, v.shape, v.dtype, hcg) else: logger.error("it doesn't support data type {}".format(type(v))) @@ -120,7 +133,11 @@ def broadcast_input_data(hcg, *inputs, **kwargs): for k, v in kwargs.items(): if isinstance(v, (core.VarBase, core.eager.Tensor)): with framework.no_grad(): - v = v.cuda() if "gpu" in cur_device else v + if "gpu" in cur_device and in_dygraph_mode() \ + and not v.place.is_gpu_place(): + v_gpu = v.cuda(int(cur_device.split(":")[1])) + v._clear_data() + v_gpu._share_buffer_to(v) _broadcast_data_help(v, v.shape, v.dtype, hcg) kwargs[k] = v else: