From 81df751d5a3e43968eb10618141774af9dc9c212 Mon Sep 17 00:00:00 2001 From: smallpoxscattered <1989838596@qq.com> Date: Sun, 11 Aug 2024 12:02:41 +0000 Subject: [PATCH 01/10] trains barrier --- .../fluid/operators/collective/barrier_op.cc | 46 --------- .../operators/collective/barrier_op.cu.cc | 96 ------------------- .../fluid/operators/collective/barrier_op.h | 67 ------------- paddle/phi/kernels/cpu/barrier_kernel.cc | 49 ++++++++++ paddle/phi/kernels/gpu/barrier_kernel.cu | 78 +++++++++++++++ .../phi/ops/yaml/inconsistent/static_ops.yaml | 6 -- paddle/phi/ops/yaml/op_compat.yaml | 2 +- paddle/phi/ops/yaml/ops.yaml | 11 +++ 8 files changed, 139 insertions(+), 216 deletions(-) delete mode 100644 paddle/fluid/operators/collective/barrier_op.cc delete mode 100644 paddle/fluid/operators/collective/barrier_op.cu.cc delete mode 100644 paddle/fluid/operators/collective/barrier_op.h create mode 100644 paddle/phi/kernels/cpu/barrier_kernel.cc create mode 100644 paddle/phi/kernels/gpu/barrier_kernel.cu mode change 100755 => 100644 paddle/phi/ops/yaml/ops.yaml diff --git a/paddle/fluid/operators/collective/barrier_op.cc b/paddle/fluid/operators/collective/barrier_op.cc deleted file mode 100644 index 02d69773c775c..0000000000000 --- a/paddle/fluid/operators/collective/barrier_op.cc +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/barrier_op.h" - -#include - -namespace paddle::operators { - -class BarrierOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override {} -}; - -class BarrierOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) Input data (only used in CUDAKernel)."); - AddOutput("Out", "(Tensor) Output data (only used in CUDAKernel)."); - AddAttr("ring_id", "(int default 0) communication ring id.") - .SetDefault(0); - AddComment(R"DOC( -Barrier Operator - Barrier among all participators.)DOC"); - } -}; - -} // namespace paddle::operators - -namespace ops = paddle::operators; - -REGISTER_OP_WITHOUT_GRADIENT(barrier, ops::BarrierOp, ops::BarrierOpMaker); - -PD_REGISTER_STRUCT_KERNEL( - barrier, CPU, ALL_LAYOUT, ops::BarrierOpCPUKernel, int) {} diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc deleted file mode 100644 index f6e628f4e26c3..0000000000000 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/collective/barrier_op.h" -#include "paddle/phi/core/distributed/comm_context_manager.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/common/flags.h" -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#include "paddle/phi/core/distributed/nccl_comm_context.h" -#include "paddle/phi/core/platform/collective_helper.h" -COMMON_DECLARE_bool(dynamic_static_unified_comm); -#endif - -namespace paddle { -namespace operators { - -template -class BarrierOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - ncclDataType_t dtype = phi::ToNCCLDataType(in->dtype()); - int64_t numel = in->numel(); - const void* sendbuff = in->data(); - void* recvbuff = out->mutable_data(place); - - int rid = ctx.Attr("ring_id"); - const auto& comm_context_manager = - phi::distributed::CommContextManager::GetInstance(); - if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), - true, - common::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(rid))); - auto comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(rid))); - PADDLE_ENFORCE_NE(comm_ctx, - nullptr, - common::errors::Unavailable( - "NCCLCommContext is nullptr, collective op should " - "has ring_id attr.")); - auto stream = comm_ctx->GetStream(); - ncclRedOp_t nccl_red_type = ncclSum; - comm_ctx->AllReduce(out, *in, nccl_red_type, stream); - phi::backends::gpu::GpuStreamSync(stream); - VLOG(3) << "new NCCLCommContext has rid " << rid; - } else { - auto comm = platform::NCCLCommContext::Instance().Get(rid, place); - // should ExecutionContext for calc stream. - auto stream = ctx.cuda_device_context().stream(); - ncclRedOp_t nccl_red_type = ncclSum; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(sendbuff, - recvbuff, - numel, - dtype, - nccl_red_type, - comm->comm(), - stream)); - phi::backends::gpu::GpuStreamSync(stream); - VLOG(3) << "old NCCLCommContext has rid " << rid; - } -#else - PADDLE_THROW( - common::errors::Unavailable("PaddlePaddle should compile with NCCL.")); -#endif - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - barrier, GPU, ALL_LAYOUT, ops::BarrierOpCUDAKernel, int) {} diff --git a/paddle/fluid/operators/collective/barrier_op.h b/paddle/fluid/operators/collective/barrier_op.h deleted file mode 100644 index d7cb87be716a6..0000000000000 --- a/paddle/fluid/operators/collective/barrier_op.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/common/ddim.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/distributed/comm_context_manager.h" - -#if defined(PADDLE_WITH_GLOO) -#include - -#include "paddle/fluid/framework/fleet/gloo_wrapper.h" -#include "paddle/phi/core/distributed/gloo_comm_context.h" -#endif - -namespace paddle { -namespace operators { - -template -class BarrierOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_GLOO) - int rid = ctx.Attr("ring_id"); - const auto& comm_context_manager = - phi::distributed::CommContextManager::GetInstance(); - if (comm_context_manager.Has(std::to_string(rid))) { - auto* comm_context = static_cast( - comm_context_manager.Get(std::to_string(rid))); - comm_context->Barrier(); - } else { - auto gloo = paddle::framework::GlooWrapper::GetInstance(); - PADDLE_ENFORCE_EQ( - gloo->IsInitialized(), - true, - common::errors::PreconditionNotMet( - "You must initialize the gloo environment first to use it.")); - gloo::BarrierOptions opts(gloo->GetContext()); - gloo::barrier(opts); - } -#else - PADDLE_THROW(common::errors::Unavailable( - "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); -#endif - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/barrier_kernel.cc b/paddle/phi/kernels/cpu/barrier_kernel.cc new file mode 100644 index 0000000000000..029629103dffc --- /dev/null +++ b/paddle/phi/kernels/cpu/barrier_kernel.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include +#include "paddle/phi/core/distributed/gloo_comm_context.h" +#endif + +namespace phi { + +template +void BarrierKernel(const Context& dev_ctx, + const DenseTensor& x_in, + int ring_id, + DenseTensor* out) { +#if defined(PADDLE_WITH_GLOO) + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + if (comm_context_manager.Has(std::to_string(ring_id))) { + auto* comm_context = static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + comm_context->Barrier(); + } else { + PADDLE_THROW(phi::errors::Unavailable( + "You must initialize the gloo environment first to use it.")); + } +#else + PADDLE_THROW(phi::errors::Unavailable( + "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); +#endif +} + +} // namespace phi + +PD_REGISTER_KERNEL(barrier, CPU, ALL_LAYOUT, phi::BarrierKernel, int) {} diff --git a/paddle/phi/kernels/gpu/barrier_kernel.cu b/paddle/phi/kernels/gpu/barrier_kernel.cu new file mode 100644 index 0000000000000..5177e89a616dc --- /dev/null +++ b/paddle/phi/kernels/gpu/barrier_kernel.cu @@ -0,0 +1,78 @@ +// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/common/flags.h" +#include "paddle/phi/backends/gpu/gpu_helper.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +COMMON_DECLARE_bool(dynamic_static_unified_comm); +#endif + +namespace phi { + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +template +void BarrierKernel(const Context& dev_ctx, + const DenseTensor& x_in, + int ring_id, + DenseTensor* out) { + auto place = dev_ctx.GetPlace(); + ncclDataType_t dtype = phi::ToNCCLDataType(x_in.dtype()); + int64_t numel = x_in.numel(); + const void* sendbuff = x_in.data(); + void* recvbuff = dev_ctx.template Alloc(out); + + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), + true, + phi::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(ring_id))); + auto comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(ring_id))); + PADDLE_ENFORCE_NOT_NULL( + comm_ctx, + phi::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + auto stream = comm_ctx->GetStream(); + ncclRedOp_t nccl_red_type = ncclSum; + comm_ctx->AllReduce(out, x_in, nccl_red_type, stream); + phi::backends::gpu::GpuStreamSync(stream); + VLOG(3) << "new NCCLCommContext has rid " << ring_id; + } else { + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = dev_ctx.stream(); + ncclRedOp_t nccl_red_type = ncclSum; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); + phi::backends::gpu::GpuStreamSync(stream); + VLOG(3) << "old NCCLCommContext has rid " << ring_id; + } +} +#endif + +} // namespace phi + +PD_REGISTER_KERNEL(barrier, GPU, ALL_LAYOUT, phi::BarrierKernel, int) {} diff --git a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml index 4a8e4cd429287..34d18adce9181 100644 --- a/paddle/phi/ops/yaml/inconsistent/static_ops.yaml +++ b/paddle/phi/ops/yaml/inconsistent/static_ops.yaml @@ -75,12 +75,6 @@ interfaces : paddle::dialect::InferSymbolicShapeInterface traits : paddle::dialect::ForwardOnlyTrait -- op : barrier - args : (Tensor x, int ring_id=0) - output : Tensor(out) - kernel : - func : barrier - - op : batch_norm args : (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_format, bool use_global_stats, bool trainable_statistics) output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) diff --git a/paddle/phi/ops/yaml/op_compat.yaml b/paddle/phi/ops/yaml/op_compat.yaml index 5ed6b0812ef94..b6a47fa0bb2ba 100755 --- a/paddle/phi/ops/yaml/op_compat.yaml +++ b/paddle/phi/ops/yaml/op_compat.yaml @@ -364,7 +364,7 @@ - op : barrier inputs : - {x : X} + {x : X, ring_id : Ring_id} outputs : out : Out diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml old mode 100755 new mode 100644 index dd30e85fc84b0..3510b214ad20f --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -455,6 +455,17 @@ data_type : param inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) +- op : barrier + args : (Tensor x, int ring_id) + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : barrier + data_type : x + traits : paddle::dialect::ForwardOnlyTrait + - op : batch_fc args : (Tensor input, Tensor w, Tensor bias) output : Tensor(out) From c381aa5d6cf4b48c17e4ce10b9381d68cca4bcb8 Mon Sep 17 00:00:00 2001 From: smallpoxscattered <1989838596@qq.com> Date: Sun, 11 Aug 2024 13:19:22 +0000 Subject: [PATCH 02/10] trains barrier --- paddle/phi/kernels/gpu/barrier_kernel.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/gpu/barrier_kernel.cu b/paddle/phi/kernels/gpu/barrier_kernel.cu index 5177e89a616dc..f31ea5a7456ec 100644 --- a/paddle/phi/kernels/gpu/barrier_kernel.cu +++ b/paddle/phi/kernels/gpu/barrier_kernel.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/kernel_registry.h" @@ -20,6 +22,7 @@ #include "paddle/common/flags.h" #include "paddle/phi/backends/gpu/gpu_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/platform/collective_helper.h" COMMON_DECLARE_bool(dynamic_static_unified_comm); #endif @@ -62,7 +65,7 @@ void BarrierKernel(const Context& dev_ctx, phi::backends::gpu::GpuStreamSync(stream); VLOG(3) << "new NCCLCommContext has rid " << ring_id; } else { - auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto comm = phi::platform::NCCLCommContext::Instance().Get(ring_id, place); auto stream = dev_ctx.stream(); ncclRedOp_t nccl_red_type = ncclSum; PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( From 385a89a5d87afcb56fa7d81038d9c512400e7ba8 Mon Sep 17 00:00:00 2001 From: smallpoxscattered <1989838596@qq.com> Date: Sun, 11 Aug 2024 13:41:06 +0000 Subject: [PATCH 03/10] trains barrier --- paddle/phi/kernels/gpu/barrier_kernel.cu | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/paddle/phi/kernels/gpu/barrier_kernel.cu b/paddle/phi/kernels/gpu/barrier_kernel.cu index f31ea5a7456ec..7fcd9421304ba 100644 --- a/paddle/phi/kernels/gpu/barrier_kernel.cu +++ b/paddle/phi/kernels/gpu/barrier_kernel.cu @@ -28,12 +28,12 @@ COMMON_DECLARE_bool(dynamic_static_unified_comm); namespace phi { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) template -void BarrierKernel(const Context& dev_ctx, - const DenseTensor& x_in, - int ring_id, - DenseTensor* out) { +void BarrierOpCUDAKernel(const Context& dev_ctx, + const DenseTensor& x_in, + int ring_id, + DenseTensor* out) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto place = dev_ctx.GetPlace(); ncclDataType_t dtype = phi::ToNCCLDataType(x_in.dtype()); int64_t numel = x_in.numel(); @@ -65,7 +65,7 @@ void BarrierKernel(const Context& dev_ctx, phi::backends::gpu::GpuStreamSync(stream); VLOG(3) << "new NCCLCommContext has rid " << ring_id; } else { - auto comm = phi::platform::NCCLCommContext::Instance().Get(ring_id, place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); auto stream = dev_ctx.stream(); ncclRedOp_t nccl_red_type = ncclSum; PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( @@ -73,9 +73,12 @@ void BarrierKernel(const Context& dev_ctx, phi::backends::gpu::GpuStreamSync(stream); VLOG(3) << "old NCCLCommContext has rid " << ring_id; } -} +#else + PADDLE_THROW( + phi::errors::Unavailable("PaddlePaddle should compile with NCCL.")); #endif +} } // namespace phi -PD_REGISTER_KERNEL(barrier, GPU, ALL_LAYOUT, phi::BarrierKernel, int) {} +PD_REGISTER_KERNEL(barrier, GPU, ALL_LAYOUT, phi::BarrierOpCUDAKernel, int) {} From 6a599ecf31f0805fdd7cca46e5c01688ab49571a Mon Sep 17 00:00:00 2001 From: smallpoxscattered <1989838596@qq.com> Date: Sun, 11 Aug 2024 15:12:55 +0000 Subject: [PATCH 04/10] fix barrier --- paddle/phi/kernels/gpu/barrier_kernel.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/gpu/barrier_kernel.cu b/paddle/phi/kernels/gpu/barrier_kernel.cu index 7fcd9421304ba..97b3fbd179ec3 100644 --- a/paddle/phi/kernels/gpu/barrier_kernel.cu +++ b/paddle/phi/kernels/gpu/barrier_kernel.cu @@ -65,7 +65,8 @@ void BarrierOpCUDAKernel(const Context& dev_ctx, phi::backends::gpu::GpuStreamSync(stream); VLOG(3) << "new NCCLCommContext has rid " << ring_id; } else { - auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto comm = + paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); auto stream = dev_ctx.stream(); ncclRedOp_t nccl_red_type = ncclSum; PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( From 88bea2955fc20976b645eb3916e067d5953b1d2c Mon Sep 17 00:00:00 2001 From: smallpoxscattered <1989838596@qq.com> Date: Mon, 12 Aug 2024 04:40:44 +0000 Subject: [PATCH 05/10] fix barrier --- paddle/phi/ops/yaml/op_compat.yaml | 2 +- paddle/phi/ops/yaml/ops.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/ops/yaml/op_compat.yaml b/paddle/phi/ops/yaml/op_compat.yaml index b6a47fa0bb2ba..5ed6b0812ef94 100755 --- a/paddle/phi/ops/yaml/op_compat.yaml +++ b/paddle/phi/ops/yaml/op_compat.yaml @@ -364,7 +364,7 @@ - op : barrier inputs : - {x : X, ring_id : Ring_id} + {x : X} outputs : out : Out diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml index 3510b214ad20f..12396ee06fed8 100644 --- a/paddle/phi/ops/yaml/ops.yaml +++ b/paddle/phi/ops/yaml/ops.yaml @@ -456,7 +456,7 @@ inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) - op : barrier - args : (Tensor x, int ring_id) + args : (Tensor x, int ring_id = 0) output : Tensor(out) infer_meta : func : UnchangedInferMeta From d1296d98335b38a1c70fa4c49e1617fea5e93d87 Mon Sep 17 00:00:00 2001 From: smallpoxscattered <1989838596@qq.com> Date: Tue, 13 Aug 2024 01:42:07 +0000 Subject: [PATCH 06/10] fix barrier --- paddle/phi/kernels/cpu/barrier_kernel.cc | 20 ++++++++------------ paddle/phi/kernels/gpu/barrier_kernel.cu | 23 ++++++----------------- 2 files changed, 14 insertions(+), 29 deletions(-) diff --git a/paddle/phi/kernels/cpu/barrier_kernel.cc b/paddle/phi/kernels/cpu/barrier_kernel.cc index 029629103dffc..76c164f9c64c5 100644 --- a/paddle/phi/kernels/cpu/barrier_kernel.cc +++ b/paddle/phi/kernels/cpu/barrier_kernel.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/kernel_registry.h" #if defined(PADDLE_WITH_GLOO) #include #include "paddle/phi/core/distributed/gloo_comm_context.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" #endif namespace phi { @@ -25,19 +25,15 @@ namespace phi { template void BarrierKernel(const Context& dev_ctx, const DenseTensor& x_in, - int ring_id, DenseTensor* out) { #if defined(PADDLE_WITH_GLOO) - const auto& comm_context_manager = - phi::distributed::CommContextManager::GetInstance(); - if (comm_context_manager.Has(std::to_string(ring_id))) { - auto* comm_context = static_cast( - comm_context_manager.Get(std::to_string(ring_id))); - comm_context->Barrier(); - } else { - PADDLE_THROW(phi::errors::Unavailable( - "You must initialize the gloo environment first to use it.")); - } + auto comm_ctx = + static_cast(dev_ctx.GetCommContext()); + PADDLE_ENFORCE_NE( + comm_ctx, + nullptr, + errors::Unavailable("NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); #else PADDLE_THROW(phi::errors::Unavailable( "PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); diff --git a/paddle/phi/kernels/gpu/barrier_kernel.cu b/paddle/phi/kernels/gpu/barrier_kernel.cu index 97b3fbd179ec3..b3bd4b14d575a 100644 --- a/paddle/phi/kernels/gpu/barrier_kernel.cu +++ b/paddle/phi/kernels/gpu/barrier_kernel.cu @@ -15,7 +15,6 @@ #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/kernel_registry.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -29,10 +28,9 @@ COMMON_DECLARE_bool(dynamic_static_unified_comm); namespace phi { template -void BarrierOpCUDAKernel(const Context& dev_ctx, - const DenseTensor& x_in, - int ring_id, - DenseTensor* out) { +void BarrierKernel(const Context& dev_ctx, + const DenseTensor& x_in, + DenseTensor* out) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto place = dev_ctx.GetPlace(); ncclDataType_t dtype = phi::ToNCCLDataType(x_in.dtype()); @@ -52,8 +50,8 @@ void BarrierOpCUDAKernel(const Context& dev_ctx, "But ring_id(%d) is " "not found in comm_context_manager.", std::to_string(ring_id))); - auto comm_ctx = static_cast( - comm_context_manager.Get(std::to_string(ring_id))); + auto comm_ctx = + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NOT_NULL( comm_ctx, phi::errors::Unavailable( @@ -64,15 +62,6 @@ void BarrierOpCUDAKernel(const Context& dev_ctx, comm_ctx->AllReduce(out, x_in, nccl_red_type, stream); phi::backends::gpu::GpuStreamSync(stream); VLOG(3) << "new NCCLCommContext has rid " << ring_id; - } else { - auto comm = - paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); - auto stream = dev_ctx.stream(); - ncclRedOp_t nccl_red_type = ncclSum; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); - phi::backends::gpu::GpuStreamSync(stream); - VLOG(3) << "old NCCLCommContext has rid " << ring_id; } #else PADDLE_THROW( @@ -82,4 +71,4 @@ void BarrierOpCUDAKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(barrier, GPU, ALL_LAYOUT, phi::BarrierOpCUDAKernel, int) {} +PD_REGISTER_KERNEL(barrier, GPU, ALL_LAYOUT, phi::BarrierKernel, int) {} From e6c643fafa88e86479aa5137e76f1e52c4335082 Mon Sep 17 00:00:00 2001 From: smallpoxscattered <1989838596@qq.com> Date: Tue, 13 Aug 2024 01:45:56 +0000 Subject: [PATCH 07/10] fix barrier --- paddle/phi/kernels/gpu/barrier_kernel.cu | 4 ---- 1 file changed, 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/barrier_kernel.cu b/paddle/phi/kernels/gpu/barrier_kernel.cu index b3bd4b14d575a..2bed586ce6e0f 100644 --- a/paddle/phi/kernels/gpu/barrier_kernel.cu +++ b/paddle/phi/kernels/gpu/barrier_kernel.cu @@ -33,10 +33,6 @@ void BarrierKernel(const Context& dev_ctx, DenseTensor* out) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto place = dev_ctx.GetPlace(); - ncclDataType_t dtype = phi::ToNCCLDataType(x_in.dtype()); - int64_t numel = x_in.numel(); - const void* sendbuff = x_in.data(); - void* recvbuff = dev_ctx.template Alloc(out); const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); From d3ee025341c9d7cf8e5b2bf4c2e4e6d46989cd91 Mon Sep 17 00:00:00 2001 From: smallpoxscattered <1989838596@qq.com> Date: Tue, 13 Aug 2024 01:47:51 +0000 Subject: [PATCH 08/10] fix barrier --- paddle/phi/kernels/gpu/barrier_kernel.cu | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/paddle/phi/kernels/gpu/barrier_kernel.cu b/paddle/phi/kernels/gpu/barrier_kernel.cu index 2bed586ce6e0f..64bed2ae6f454 100644 --- a/paddle/phi/kernels/gpu/barrier_kernel.cu +++ b/paddle/phi/kernels/gpu/barrier_kernel.cu @@ -32,20 +32,8 @@ void BarrierKernel(const Context& dev_ctx, const DenseTensor& x_in, DenseTensor* out) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto place = dev_ctx.GetPlace(); - const auto& comm_context_manager = - phi::distributed::CommContextManager::GetInstance(); if (FLAGS_dynamic_static_unified_comm) { - PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), - true, - phi::errors::InvalidArgument( - "You choose to use new communication library by " - "setting environment " - "variable FLAGS_dynamic_static_unified_comm True. " - "But ring_id(%d) is " - "not found in comm_context_manager.", - std::to_string(ring_id))); auto comm_ctx = static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NOT_NULL( From 3d0bfd0658a1f7d49c383ce8a319262b533c5688 Mon Sep 17 00:00:00 2001 From: smallpoxscattered <1989838596@qq.com> Date: Tue, 13 Aug 2024 02:55:57 +0000 Subject: [PATCH 09/10] fix barrier --- paddle/phi/kernels/gpu/barrier_kernel.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/phi/kernels/gpu/barrier_kernel.cu b/paddle/phi/kernels/gpu/barrier_kernel.cu index 64bed2ae6f454..4065e2361b41d 100644 --- a/paddle/phi/kernels/gpu/barrier_kernel.cu +++ b/paddle/phi/kernels/gpu/barrier_kernel.cu @@ -45,7 +45,6 @@ void BarrierKernel(const Context& dev_ctx, ncclRedOp_t nccl_red_type = ncclSum; comm_ctx->AllReduce(out, x_in, nccl_red_type, stream); phi::backends::gpu::GpuStreamSync(stream); - VLOG(3) << "new NCCLCommContext has rid " << ring_id; } #else PADDLE_THROW( From 44cd7ec1d98ab6255f82c624715f3c9231115fe9 Mon Sep 17 00:00:00 2001 From: smallpoxscattered <1989838596@qq.com> Date: Wed, 21 Aug 2024 01:39:28 +0000 Subject: [PATCH 10/10] fix barrier --- paddle/phi/kernels/cpu/barrier_kernel.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/cpu/barrier_kernel.cc b/paddle/phi/kernels/cpu/barrier_kernel.cc index 76c164f9c64c5..2d90dd21a992d 100644 --- a/paddle/phi/kernels/cpu/barrier_kernel.cc +++ b/paddle/phi/kernels/cpu/barrier_kernel.cc @@ -17,7 +17,6 @@ #if defined(PADDLE_WITH_GLOO) #include #include "paddle/phi/core/distributed/gloo_comm_context.h" -#include "paddle/phi/core/distributed/nccl_comm_context.h" #endif namespace phi { @@ -28,11 +27,11 @@ void BarrierKernel(const Context& dev_ctx, DenseTensor* out) { #if defined(PADDLE_WITH_GLOO) auto comm_ctx = - static_cast(dev_ctx.GetCommContext()); + static_cast(dev_ctx.GetCommContext()); PADDLE_ENFORCE_NE( comm_ctx, nullptr, - errors::Unavailable("NCCLCommContext is nullptr, collective op should " + errors::Unavailable("GlooCommContext is nullptr, collective op should " "has ring_id attr.")); #else PADDLE_THROW(phi::errors::Unavailable(