diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index e10a985b34833..8ce39f0db7bc1 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -92,8 +92,7 @@ if(WITH_GPU)
          memory
          dynload_cuda
          variable_visitor
-         place
-         device_memory_aligment)
+         place)
   nv_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -105,7 +104,6 @@ if(WITH_GPU)
          dynload_cuda
          variable_visitor
          place
-         device_memory_aligment
          all_reduce_op_handle
          fused_all_reduce_op_handle)
 
@@ -170,8 +168,7 @@ elseif(WITH_ROCM)
          memory
          dynload_cuda
          variable_visitor
-         place
-         device_memory_aligment)
+         place)
   hip_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -183,7 +180,6 @@ elseif(WITH_ROCM)
          dynload_cuda
          variable_visitor
          place
-         device_memory_aligment
          all_reduce_op_handle
          fused_all_reduce_op_handle)
 
@@ -233,8 +229,7 @@ else()
          ddim
          memory
          variable_visitor
-         place
-         device_memory_aligment)
+         place)
   cc_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -245,7 +240,6 @@ else()
          memory
          variable_visitor
          place
-         device_memory_aligment
          all_reduce_op_handle
          fused_all_reduce_op_handle)
   if(WITH_DISTRIBUTE)
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index bca280fc35cb9..de9161b1e3312 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -16,9 +16,9 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 
 DEFINE_bool(skip_fused_all_reduce_check, false, "");
 DECLARE_bool(allreduce_record_one_event);
@@ -247,7 +247,7 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
     for (size_t k = 1; k < g_tensor.size(); ++k) {
       const void *cur_address = g_tensor.at(k - 1).second->data();
       int64_t len = g_tensor.at(k - 1).second->numel();
-      auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
+      auto offset = phi::Alignment(len * size_of_dtype, places_[0]);
       void *infer_next_address = reinterpret_cast<void *>(
           reinterpret_cast<uintptr_t>(cur_address) + offset);
       const void *next_address = g_tensor.at(k).second->data();
@@ -400,8 +400,7 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
             "The size of grad tensors of fused_all_reduce_op_handle "
             "must be > 0, but got %d.",
             len));
-    *numel +=
-        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
+    *numel += phi::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 3f1bde61aa644..fce844960b0fb 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -153,7 +153,7 @@ cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute cudnn_workspace_helper)
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function
 lod_tensor maxouting unpooling pooling lod_rank_table context_project
-sequence_pooling executor device_memory_aligment generator)
+sequence_pooling executor generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve)
@@ -167,7 +167,6 @@ if(WITH_XPU)
   cc_test(beam_search_decode_op_xpu_test SRCS beam_search_decode_op_xpu_test.cc DEPS lod_tensor)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xpulib)
 endif()
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} op_version_registry)
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
index e148c5b4b10e5..6bdfe9e8b754f 100644
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -19,7 +19,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
@@ -250,9 +250,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         framework::TensorCopy(
             *in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);
 
-        offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                  context.GetPlace(),
-                                                  align_size) /
+        offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                             context.GetPlace(),
+                                             align_size) /
                                   size_of_dtype
                             : len;
       }
@@ -274,9 +274,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
           framework::TensorCopy(
               *out_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);
         }
-        offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                  context.GetPlace(),
-                                                  align_size) /
+        offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                             context.GetPlace(),
+                                             align_size) /
                                   size_of_dtype
                             : len;
       }
@@ -296,7 +296,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
               static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
           .Resize(dim);
       len = use_align
-                ? platform::Alignment(
+                ? phi::Alignment(
                       len * size_of_dtype, context.GetPlace(), align_size) /
                       size_of_dtype
                 : len;
@@ -342,12 +342,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
           0,
           platform::errors::InvalidArgument(
               "The number of tensor `%s`'s elements is 0.", var_names[i]));
-      auto len = use_align ? platform::Alignment(
-                                 static_cast<size_t>(size) * size_of_dtype,
-                                 place,
-                                 align_size) /
-                                 size_of_dtype
-                           : static_cast<size_t>(size);
+      auto len = use_align
+                     ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                      place,
+                                      align_size) /
+                           size_of_dtype
+                     : static_cast<size_t>(size);
       const void *ptr =
           lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
       VLOG(4) << size << " " << len;
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 4a4af8d75fb40..2db144f423fc7 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -378,10 +378,6 @@ if(WITH_GPU)
          stats
          op_proto_maker
          shape_inference)
-  nv_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 elseif(WITH_ROCM)
   hip_library(
     profiler
@@ -394,10 +390,6 @@ elseif(WITH_ROCM)
          stats
          op_proto_maker
          shape_inference)
-  hip_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 else()
   cc_library(
     profiler
@@ -409,10 +401,6 @@ else()
          stats
          op_proto_maker
          shape_inference)
-  cc_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info place)
 endif()
 
 cc_test(
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 217ba89272412..8ce9dc54b021d 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -79,11 +79,6 @@ size_t CpuMaxAllocSize() {
   return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }
 
-size_t CpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  return 1 << 12;
-}
-
 size_t CpuMaxChunkSize() {
   // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
   // or the initial_cpu_memory_in_mb.
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 6c5bf68227a02..b1220e615da00 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -63,8 +63,7 @@ size_t CpuMaxAllocSize();
 //! Get the maximum allocation size for a machine.
 size_t CUDAPinnedMaxAllocSize();
 
-//! Get the minimum chunk size for buddy allocator.
-size_t CpuMinChunkSize();
+using phi::backends::cpu::CpuMinChunkSize;
 
 //! Get the maximum chunk size for buddy allocator.
 size_t CpuMaxChunkSize();
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index 7cceb8ccec3e1..b25f15d688730 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -124,11 +124,6 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
 
 size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
 
-size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t GpuMaxChunkSize() {
   size_t max_chunk_size = GpuMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
@@ -410,8 +405,8 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id) {  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id) {
   return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
       handle, size, prop, flags);
 }
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h
index 5e5128c29fbbf..bb876f5c526d5 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.h
+++ b/paddle/fluid/platform/device/gpu/gpu_info.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
 
 namespace paddle {
 namespace platform {
@@ -81,8 +82,7 @@ size_t GpuInitAllocSize();
 //! Get the re-allocation size of current GPU device.
 size_t GpuReallocSize();
 
-//! Get the minimum chunk size for GPU buddy allocator.
-size_t GpuMinChunkSize();
+using phi::backends::gpu::GpuMinChunkSize;
 
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
@@ -140,8 +140,8 @@ gpuError_t GpuGetLastError();
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id);  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id);
 
 //! cuMemRelease with recorded info
 CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
diff --git a/paddle/fluid/platform/device/mlu/mlu_info.cc b/paddle/fluid/platform/device/mlu/mlu_info.cc
index a2e063397bd3a..f4df71d984711 100644
--- a/paddle/fluid/platform/device/mlu/mlu_info.cc
+++ b/paddle/fluid/platform/device/mlu/mlu_info.cc
@@ -226,11 +226,6 @@ size_t MLUInitAllocSize() { return MLUAllocSize(/* realloc = */ false); }
 
 size_t MLUReallocSize() { return MLUAllocSize(/* realloc = */ true); }
 
-size_t MLUMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t MLUMaxChunkSize() {
   size_t max_chunk_size = MLUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
diff --git a/paddle/fluid/platform/device/mlu/mlu_info.h b/paddle/fluid/platform/device/mlu/mlu_info.h
index c0cd24f00fbb6..435e71cf10564 100644
--- a/paddle/fluid/platform/device/mlu/mlu_info.h
+++ b/paddle/fluid/platform/device/mlu/mlu_info.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <cncl.h>
 #endif
 #include <vector>
+#include "paddle/phi/backends/mlu/mlu_info.h"
 
 namespace paddle {
 
@@ -89,8 +90,7 @@ size_t MLUInitAllocSize();
 //! Get the re-allocation size of current MLU device.
 size_t MLUReallocSize();
 
-//! Get the minimum chunk size for MLU buddy allocator.
-size_t MLUMinChunkSize();
+using phi::backends::mlu::MLUMinChunkSize;
 
 //! Get the maximum chunk size for MLU buddy allocator.
 size_t MLUMaxChunkSize();
diff --git a/paddle/fluid/platform/device/npu/npu_info.cc b/paddle/fluid/platform/device/npu/npu_info.cc
index 9acdef985ade2..b9409802b3d30 100644
--- a/paddle/fluid/platform/device/npu/npu_info.cc
+++ b/paddle/fluid/platform/device/npu/npu_info.cc
@@ -179,14 +179,6 @@ size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); }
 
 size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); }
 
-size_t NPUMinChunkSize() {
-  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
-  // though no document specify that explicitly.
-  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
-  // details.
-  return 1 << 9;
-}
-
 size_t NPUMaxChunkSize() {
   size_t max_chunk_size = NPUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
diff --git a/paddle/fluid/platform/device/npu/npu_info.h b/paddle/fluid/platform/device/npu/npu_info.h
index ea55831db2e22..534493266a5b6 100644
--- a/paddle/fluid/platform/device/npu/npu_info.h
+++ b/paddle/fluid/platform/device/npu/npu_info.h
@@ -22,6 +22,7 @@ limitations under the License. */
 
 #include "acl/acl.h"
 #include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/phi/backends/npu/npu_info.h"
 
 namespace paddle {
 namespace platform {
@@ -69,8 +70,7 @@ size_t NPUInitAllocSize();
 //! Get the re-allocation size of current NPU device.
 size_t NPUReallocSize();
 
-//! Get the minimum chunk size for NPU buddy allocator.
-size_t NPUMinChunkSize();
+using phi::backends::npu::NPUMinChunkSize;
 
 //! Get the maximum chunk size for NPU buddy allocator.
 size_t NPUMaxChunkSize();
diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h
deleted file mode 100644
index 175a9a136be35..0000000000000
--- a/paddle/fluid/platform/device_memory_aligment.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stddef.h>
-
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/place.h"
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/device/npu/npu_info.h"
-#endif
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
-
-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size,
-                 const platform::Place &place,
-                 int align_size = -1);
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/phi/backends/cpu/cpu_info.h b/paddle/phi/backends/cpu/cpu_info.h
index cf7c6d95057f8..12db2c7d09d39 100644
--- a/paddle/phi/backends/cpu/cpu_info.h
+++ b/paddle/phi/backends/cpu/cpu_info.h
@@ -39,6 +39,13 @@ namespace phi {
 namespace backends {
 namespace cpu {
+
+//! Get the minimum chunk size for buddy allocator.
+inline size_t CpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 4 KB.
+  return 1 << 12;
+}
+
 typedef enum {
   isa_any,
   sse42,
@@ -51,6 +58,7 @@ typedef enum {
   avx512_mic_4ops,
   avx512_bf16,
 } cpu_isa_t;  // Instruction set architecture
+
 }  // namespace cpu
 }  // namespace backends
 }  // namespace phi
diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/phi/backends/device_memory_aligment.h
similarity index 54%
rename from paddle/fluid/platform/device_memory_aligment.cc
rename to paddle/phi/backends/device_memory_aligment.h
index e8a6051c19f2d..a9e1fc384085a 100644
--- a/paddle/fluid/platform/device_memory_aligment.cc
+++ b/paddle/phi/backends/device_memory_aligment.h
@@ -12,38 +12,53 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#pragma once
+#include <stddef.h>
-
-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size, const platform::Place &place, int align_size) {
+#include "paddle/phi/backends/cpu/cpu_info.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/phi/backends/npu/npu_info.h"
+#endif
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#ifdef PADDLE_WITH_MLU
+#include "paddle/phi/backends/mlu/mlu_info.h"
+#endif
+
+namespace phi {
+
+inline size_t Alignment(size_t size,
+                        const phi::Place &place,
+                        int align_size = -1) {
   size_t alignment = 0;
   if (align_size > 0) {
     alignment = align_size;
   } else {
     alignment = 1024;
-    if (platform::is_cpu_place(place)) {
-      alignment = CpuMinChunkSize();
+    if (place.GetType() == phi::AllocationType::CPU) {
+      alignment = phi::backends::cpu::CpuMinChunkSize();
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      alignment = GpuMinChunkSize();
+      alignment = phi::backends::gpu::GpuMinChunkSize();
 #elif defined(PADDLE_WITH_XPU)
       alignment = alignment;
 #elif defined(PADDLE_WITH_ASCEND_CL)
-      alignment = NPUMinChunkSize();
+      alignment = phi::backends::npu::NPUMinChunkSize();
 #elif defined(PADDLE_WITH_MLU)
-      alignment = MLUMinChunkSize();
+      alignment = phi::backends::mlu::MLUMinChunkSize();
 #else
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
+      PADDLE_THROW(phi::errors::PreconditionNotMet(
           "Fluid is not compiled with CUDA/XPU/NPU/MLU."));
 #endif
     }
   }
-  if (is_npu_place(place)) {
+  if (place.GetType() == phi::AllocationType::NPU) {
     size += 32;  // required by ascendcl
   }
   size_t remaining = size % alignment;
   return remaining == 0 ? size : size + (alignment - remaining);
 }
-}  // namespace platform
-}  // namespace paddle
+
+}  // namespace phi
diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h
index 323565c000a1c..0f3c984ce8582 100644
--- a/paddle/phi/backends/gpu/gpu_info.h
+++ b/paddle/phi/backends/gpu/gpu_info.h
@@ -67,6 +67,12 @@ const gpuDeviceProp &GetDeviceProperties(int id);
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);
 
+//! Get the minimum chunk size for GPU buddy allocator.
+inline size_t GpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst,
                     const void *src,
diff --git a/paddle/phi/backends/mlu/mlu_info.h b/paddle/phi/backends/mlu/mlu_info.h
new file mode 100644
index 0000000000000..bf75c1cf295e8
--- /dev/null
+++ b/paddle/phi/backends/mlu/mlu_info.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_MLU
+
+namespace phi {
+namespace backends {
+namespace mlu {
+
+//! Get the minimum chunk size for MLU buddy allocator.
+inline size_t MLUMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
+}  // namespace mlu
+}  // namespace backends
+}  // namespace phi
+
+#endif
diff --git a/paddle/phi/backends/npu/npu_info.h b/paddle/phi/backends/npu/npu_info.h
new file mode 100644
index 0000000000000..21206ae0b28f3
--- /dev/null
+++ b/paddle/phi/backends/npu/npu_info.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_ASCEND_CL
+
+namespace phi {
+namespace backends {
+namespace npu {
+
+//! Get the minimum chunk size for NPU buddy allocator.
+inline size_t NPUMinChunkSize() {
+  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
+  // though no document specify that explicitly.
+  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
+  // details.
+  return 1 << 9;
+}
+
+}  // namespace npu
+}  // namespace backends
+}  // namespace phi
+
+#endif
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 75659d2bcd81a..ef2231c059ad9 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -76,8 +76,7 @@ set(COMMON_KERNEL_DEPS
     fft
     phi_data_layout_transform
     gpc
-    utf8proc
-    device_memory_aligment)
+    utf8proc)
 
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
 if(WITH_NCCL OR WITH_RCCL)
diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc
index 2fafb5b9efae6..819bb30712bf0 100644
--- a/paddle/phi/kernels/check_memory_continue_kernel.cc
+++ b/paddle/phi/kernels/check_memory_continue_kernel.cc
@@ -20,7 +20,7 @@
 
 #include "paddle/phi/core/kernel_registry.h"
 
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 
 namespace phi {
 
@@ -44,8 +44,7 @@ void CheckMemoryContinueKernel(const Context &dev_ctx,
             input.at(i)->dtype()));
     const void *cur_address = input.at(i - 1)->data();
     int64_t len = input.at(i - 1)->numel();
-    auto offset =
-        paddle::platform::Alignment(len * size_of_dtype, dev_ctx.GetPlace());
+    auto offset = phi::Alignment(len * size_of_dtype, dev_ctx.GetPlace());
     void *infer_next_address = reinterpret_cast<void *>(
         reinterpret_cast<uintptr_t>(cur_address) + offset);
     const void *next_address = input.at(i)->data();
@@ -71,8 +70,8 @@ void CheckMemoryContinueKernel(const Context &dev_ctx,
           infer_next_address,
           next_address));
   }
-  numel += paddle::platform::Alignment(
-      (*input.rbegin())->numel() * size_of_dtype, dev_ctx.GetPlace());
+  numel += phi::Alignment((*input.rbegin())->numel() * size_of_dtype,
+                          dev_ctx.GetPlace());
   // reset holder, do inplace
   output->ShareBufferWith(*input.at(0));
   output->Resize({numel / size_of_dtype});
diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc
index 7c3e3aafd22fa..faa6f71d4cdf5 100644
--- a/paddle/phi/kernels/coalesce_tensor_kernel.cc
+++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc
@@ -17,8 +17,8 @@
 #include <sstream>
 #include <vector>
 
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -72,12 +72,12 @@ void GetMemSizeAndDtype(const std::vector<const DenseTensor *> &lod_tensors,
         0,
         errors::InvalidArgument(
             "The number of `%d`-th tensor's elements is 0.", i));
-    auto len = use_align ? paddle::platform::Alignment(
-                               static_cast<size_t>(size) * size_of_dtype,
-                               place,
-                               align_size) /
-                               size_of_dtype
-                         : static_cast<size_t>(size);
+    auto len = use_align
+                   ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                    place,
+                                    align_size) /
+                         size_of_dtype
+                   : static_cast<size_t>(size);
     const void *ptr =
         lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
     VLOG(4) << size << " " << len;
@@ -206,7 +206,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
         phi::Copy(dev_ctx, *input[i], dev_ctx.GetPlace(), false, &sub_tensor);
 
         offset += use_align
-                      ? paddle::platform::Alignment(
+                      ? phi::Alignment(
                             len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                             size_of_dtype
                       : len;
@@ -224,7 +224,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
           phi::Copy(dev_ctx, *output[i], dev_ctx.GetPlace(), false, &sub_tensor);
         }
         offset += use_align
-                      ? paddle::platform::Alignment(
+                      ? phi::Alignment(
                             len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                             size_of_dtype
                       : len;
@@ -244,7 +244,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
         ->ShareDataWith(fused_output->Slice(static_cast<int64_t>(offset),
                                             static_cast<int64_t>(offset + len)))
         .Resize(dim);
-    len = use_align ? paddle::platform::Alignment(
+    len = use_align ? phi::Alignment(
                           len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                           size_of_dtype
                     : len;
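
For reference, a minimal usage sketch of the relocated helper. This is illustrative only: it assumes a build that can compile against the Paddle source tree so that the new paddle/phi/backends/device_memory_aligment.h is on the include path, and the byte counts are made-up example values, not numbers taken from the patch.

    // Illustrative sketch: exercises phi::Alignment() as declared in
    // paddle/phi/backends/device_memory_aligment.h after this move.
    #include <cstdio>

    #include "paddle/phi/backends/device_memory_aligment.h"

    int main() {
      phi::Place cpu(phi::AllocationType::CPU);

      // With the default align_size (-1), CPU sizes are rounded up to a
      // multiple of CpuMinChunkSize() (4 KB), so 5000 bytes becomes 8192.
      size_t padded = phi::Alignment(5000, cpu);

      // An explicit align_size overrides the per-device minimum chunk size;
      // 5000 bytes rounded up to the next multiple of 256 is 5120.
      size_t padded256 = phi::Alignment(5000, cpu, /*align_size=*/256);

      std::printf("%zu %zu\n", padded, padded256);  // expected: 8192 5120
      return 0;
    }

Because phi::Alignment and the per-device *MinChunkSize() helpers are now header-only inline functions under paddle/phi/backends, the separate device_memory_aligment library target (and its cpu_info/gpu_info/place dependencies) is no longer needed, which is why it is removed from the framework, operators, platform, and kernels CMake files above.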