-
Notifications
You must be signed in to change notification settings - Fork 5.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
【Hackathon 6th Fundable Projects 3 No.13】barrier #67310
Changes from 6 commits
81df751
c08f3bd
c381aa5
385a89a
6a599ec
88bea29
d1296d9
e6c643f
d3ee025
3d0bfd0
44cd7ec
9a050a3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
This file was deleted.
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "paddle/phi/core/distributed/comm_context_manager.h" | ||
#include "paddle/phi/core/kernel_registry.h" | ||
|
||
#if defined(PADDLE_WITH_GLOO) | ||
#include <gloo/barrier.h> | ||
#include "paddle/phi/core/distributed/gloo_comm_context.h" | ||
#endif | ||
|
||
namespace phi { | ||
|
||
template <typename T, typename Context> | ||
void BarrierKernel(const Context& dev_ctx, | ||
const DenseTensor& x_in, | ||
int ring_id, | ||
DenseTensor* out) { | ||
#if defined(PADDLE_WITH_GLOO) | ||
const auto& comm_context_manager = | ||
phi::distributed::CommContextManager::GetInstance(); | ||
if (comm_context_manager.Has(std::to_string(ring_id))) { | ||
auto* comm_context = static_cast<phi::distributed::GlooCommContext*>( | ||
comm_context_manager.Get(std::to_string(ring_id))); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. phi kernel 里不能使用单例。
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 老师,cpu里面好像用不了 NCCL,我可以改成 auto comm_ctx = |
||
comm_context->Barrier(); | ||
} else { | ||
PADDLE_THROW(phi::errors::Unavailable( | ||
"You must initialize the gloo environment first to use it.")); | ||
} | ||
#else | ||
PADDLE_THROW(phi::errors::Unavailable( | ||
"PaddlePaddle should compile with GLOO by setting WITH_GLOO=ON")); | ||
#endif | ||
} | ||
|
||
} // namespace phi | ||
|
||
PD_REGISTER_KERNEL(barrier, CPU, ALL_LAYOUT, phi::BarrierKernel, int) {} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include "glog/logging.h" | ||
|
||
#include "paddle/phi/backends/gpu/gpu_context.h" | ||
#include "paddle/phi/core/distributed/comm_context_manager.h" | ||
#include "paddle/phi/core/kernel_registry.h" | ||
|
||
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) | ||
#include "paddle/common/flags.h" | ||
#include "paddle/phi/backends/gpu/gpu_helper.h" | ||
#include "paddle/phi/core/distributed/nccl_comm_context.h" | ||
#include "paddle/phi/core/platform/collective_helper.h" | ||
COMMON_DECLARE_bool(dynamic_static_unified_comm); | ||
#endif | ||
|
||
namespace phi { | ||
|
||
template <typename T, typename Context> | ||
void BarrierOpCUDAKernel(const Context& dev_ctx, | ||
const DenseTensor& x_in, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BarrierOpCUDAKernel 改成 BarrierKernel |
||
int ring_id, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ring_id kernel中用不到,这里删掉 |
||
DenseTensor* out) { | ||
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) | ||
auto place = dev_ctx.GetPlace(); | ||
ncclDataType_t dtype = phi::ToNCCLDataType(x_in.dtype()); | ||
int64_t numel = x_in.numel(); | ||
const void* sendbuff = x_in.data(); | ||
void* recvbuff = dev_ctx.template Alloc<T>(out); | ||
|
||
const auto& comm_context_manager = | ||
phi::distributed::CommContextManager::GetInstance(); | ||
if (FLAGS_dynamic_static_unified_comm) { | ||
PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)), | ||
true, | ||
phi::errors::InvalidArgument( | ||
"You choose to use new communication library by " | ||
"setting environment " | ||
"variable FLAGS_dynamic_static_unified_comm True. " | ||
"But ring_id(%d) is " | ||
"not found in comm_context_manager.", | ||
std::to_string(ring_id))); | ||
auto comm_ctx = static_cast<phi::distributed::NCCLCommContext*>( | ||
comm_context_manager.Get(std::to_string(ring_id))); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. phi kernel 里不能使用单例。
|
||
PADDLE_ENFORCE_NOT_NULL( | ||
comm_ctx, | ||
phi::errors::Unavailable( | ||
"NCCLCommContext is nullptr, collective op should " | ||
"has ring_id attr.")); | ||
auto stream = comm_ctx->GetStream(); | ||
ncclRedOp_t nccl_red_type = ncclSum; | ||
comm_ctx->AllReduce(out, x_in, nccl_red_type, stream); | ||
phi::backends::gpu::GpuStreamSync(stream); | ||
VLOG(3) << "new NCCLCommContext has rid " << ring_id; | ||
} else { | ||
auto comm = | ||
paddle::platform::NCCLCommContext::Instance().Get(ring_id, place); | ||
auto stream = dev_ctx.stream(); | ||
ncclRedOp_t nccl_red_type = ncclSum; | ||
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( | ||
sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); | ||
phi::backends::gpu::GpuStreamSync(stream); | ||
VLOG(3) << "old NCCLCommContext has rid " << ring_id; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. phi 下算子,只用新通信库,即 FLAGS_dynamic_static_unified_comm = True。那么, |
||
#else | ||
PADDLE_THROW( | ||
phi::errors::Unavailable("PaddlePaddle should compile with NCCL.")); | ||
#endif | ||
} | ||
|
||
} // namespace phi | ||
|
||
PD_REGISTER_KERNEL(barrier, GPU, ALL_LAYOUT, phi::BarrierOpCUDAKernel, int) {} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -455,6 +455,17 @@ | |
data_type : param | ||
inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) | ||
|
||
# NOTE(review): ring_id must stay in args for backward compatibility even
# though the kernels may not consume it directly (see review discussion).
- op : barrier
  args : (Tensor x, int ring_id = 0)
  output : Tensor(out)
  infer_meta :
    func : UnchangedInferMeta
    param : [x]
  kernel :
    func : barrier
    data_type : x
  traits : paddle::dialect::ForwardOnlyTrait
|
||
- op : batch_fc | ||
args : (Tensor input, Tensor w, Tensor bias) | ||
output : Tensor(out) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ring_id kernel中用不到,这里删掉