diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_moe_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_moe_op.cu index 91fa104498bf4..6e6b41dd6ab74 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_moe_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_moe_op.cu @@ -429,7 +429,6 @@ class FusedMultiTransformerMoeOpKernel : public framework::OpKernel { } else { sliced_inp = buf0; } - VLOG(0) << "sliced_inp shape[0]: " << sliced_inp.dims()[0]; #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER VLOG(0) << "moe, gate & topk"; #endif @@ -825,12 +824,10 @@ class FusedMultiTransformerMoeOpKernel : public framework::OpKernel { // phi::AddKernel(dev_ctx, all_gather_out, bias_dropout_residual_out, &moe_out); x_data = moe_out.data(); - VLOG(0) << "layer " << i + 1 << " end"; } // layers loop end moe_out.Resize({{bsz, seq_len, dim_embed}}); *out = moe_out; - VLOG(0) << "kernel end"; } }; diff --git a/paddle/phi/kernels/fused_moe_kernel.h b/paddle/phi/kernels/fused_moe_kernel.h index a4abd127beb9a..d6a1f112c762c 100644 --- a/paddle/phi/kernels/fused_moe_kernel.h +++ b/paddle/phi/kernels/fused_moe_kernel.h @@ -72,7 +72,6 @@ static void AllToAll(Tensor& tensor, // NOLINT out_tensor.push_back(out); auto task = pg_nccl->AllToAll(in_tensor, out_tensor); task->Wait(); - VLOG(0) << "wait, all to all success !"; } else { auto dtype = platform::ToNCCLDataType( framework::TransToProtoVarType(tensor.dtype())); @@ -245,7 +244,6 @@ void GlobalScatterFunctor(const phi::GPUContext& ctx, } #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); - // VLOG(0) << "GlobalScatterFunctor cudaDeviceSynchronize success !"; #else PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -334,12 +332,10 @@ void GlobalScatterProcessGroupFunctor(const phi::GPUContext& ctx, } } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); - // VLOG(0) << "GlobalScatterProcessGroupFunctor ncclGroupEnd " << i; } #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); - // VLOG(0) << "GlobalScatterProcessGroupFunctor cudaDeviceSynchronize success!"; #else PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -449,7 +445,6 @@ void GlobalGatherFunctor(const phi::GPUContext& ctx, #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); - // VLOG(0) << "GlobalGatherFunctor cudaDeviceSynchronize success !"; #else PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -542,7 +537,6 @@ void GlobalGatherProcessGroupFunctor(const phi::GPUContext& ctx, #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); - // VLOG(0) << "GlobalGatherProcessGroupFunctor cudaDeviceSynchronize success !"; #else PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif diff --git a/paddle/phi/kernels/gpu/fused_moe_kernel.cu b/paddle/phi/kernels/gpu/fused_moe_kernel.cu index d9d5abab64e82..657f53b9e29a8 100644 --- a/paddle/phi/kernels/gpu/fused_moe_kernel.cu +++ b/paddle/phi/kernels/gpu/fused_moe_kernel.cu @@ -54,7 +54,6 @@ void FusedMoeKernel(const DeviceContext& dev_ctx, // auto out_dim = out->dims(); int bsz_seq = bsz * seq_len; int tot_expert = world_size * num_expert; - // VLOG(0) << "moe, get dim: bsz_seq:" << bsz_seq << ", x.dim:" << x_dim << ", out.dim:" << out_dim; // pre_layer_norm const U* ln_scale_ptr = ln_scale.data(); @@ -70,7 +69,6 @@ void FusedMoeKernel(const DeviceContext& dev_ctx, Tensor ln_out; ln_out.Resize({{bsz, seq_len, d_model}}); auto *ln_out_data = dev_ctx.template Alloc(&ln_out); - // VLOG(0) << "moe, alloc pre layer norm"; // after slice, bsz_seq should be change int sliced_bsz_seq = bsz_seq; int start = 0; @@ -418,4 +416,4 @@ PD_REGISTER_KERNEL(fused_moe_kernel, phi::FusedMoeKernel, float, double, - paddle::platform::float16) {} \ No newline at end of file + paddle::platform::float16) {}