diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index e10a985b34833..8ce39f0db7bc1 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -92,8 +92,7 @@ if(WITH_GPU)
          memory
          dynload_cuda
          variable_visitor
-         place
-         device_memory_aligment)
+         place)
   nv_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -105,7 +104,6 @@ if(WITH_GPU)
          dynload_cuda
          variable_visitor
          place
-         device_memory_aligment
          all_reduce_op_handle
          fused_all_reduce_op_handle)
 
@@ -170,8 +168,7 @@ elseif(WITH_ROCM)
          memory
          dynload_cuda
          variable_visitor
-         place
-         device_memory_aligment)
+         place)
   hip_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -183,7 +180,6 @@ elseif(WITH_ROCM)
          dynload_cuda
          variable_visitor
          place
-         device_memory_aligment
          all_reduce_op_handle
          fused_all_reduce_op_handle)
 
@@ -233,8 +229,7 @@ else()
          ddim
          memory
          variable_visitor
-         place
-         device_memory_aligment)
+         place)
   cc_library(
     grad_merge_all_reduce_op_handle
     SRCS grad_merge_all_reduce_op_handle.cc
@@ -245,7 +240,6 @@ else()
          memory
          variable_visitor
          place
-         device_memory_aligment
          all_reduce_op_handle
          fused_all_reduce_op_handle)
   if(WITH_DISTRIBUTE)
diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index bca280fc35cb9..de9161b1e3312 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -16,9 +16,9 @@
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 
 DEFINE_bool(skip_fused_all_reduce_check, false, "");
 DECLARE_bool(allreduce_record_one_event);
@@ -247,7 +247,7 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
     for (size_t k = 1; k < g_tensor.size(); ++k) {
       const void *cur_address = g_tensor.at(k - 1).second->data();
       int64_t len = g_tensor.at(k - 1).second->numel();
-      auto offset = platform::Alignment(len * size_of_dtype, places_[0]);
+      auto offset = phi::Alignment(len * size_of_dtype, places_[0]);
       void *infer_next_address = reinterpret_cast<void *>(
           reinterpret_cast<uintptr_t>(cur_address) + offset);
       const void *next_address = g_tensor.at(k).second->data();
@@ -400,8 +400,7 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
             "The size of grad tensors of fused_all_reduce_op_handle "
             "must be > 0, but got %d.",
             len));
-    *numel +=
-        platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
+    *numel += phi::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
   }
 }
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 3f1bde61aa644..fce844960b0fb 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -153,7 +153,7 @@ cc_library(ops_extra_info SRCS ops_extra_info.cc DEPS attribute cudnn_workspace_helper)
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function
 lod_tensor maxouting unpooling pooling lod_rank_table context_project
-sequence_pooling executor device_memory_aligment generator)
+sequence_pooling executor generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve)
@@ -167,7 +167,6 @@ if(WITH_XPU)
   cc_test(beam_search_decode_op_xpu_test SRCS beam_search_decode_op_xpu_test.cc DEPS lod_tensor)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xpulib)
 endif()
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} op_version_registry)
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
index e148c5b4b10e5..6bdfe9e8b754f 100644
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -19,7 +19,7 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
@@ -250,9 +250,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
         framework::TensorCopy(
             *in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);
 
-        offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                  context.GetPlace(),
-                                                  align_size) /
+        offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                             context.GetPlace(),
+                                             align_size) /
                                   size_of_dtype
                             : len;
       }
@@ -274,9 +274,9 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
           framework::TensorCopy(
               *out_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor);
         }
-        offset += use_align ? platform::Alignment(len * size_of_dtype,
-                                                  context.GetPlace(),
-                                                  align_size) /
+        offset += use_align ? phi::Alignment(len * size_of_dtype,
+                                             context.GetPlace(),
+                                             align_size) /
                                   size_of_dtype
                             : len;
       }
@@ -296,7 +296,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
               static_cast<int64_t>(offset), static_cast<int64_t>(offset + len)))
           .Resize(dim);
       len = use_align
-                ? platform::Alignment(
+                ? phi::Alignment(
                       len * size_of_dtype, context.GetPlace(), align_size) /
                       size_of_dtype
                 : len;
@@ -342,12 +342,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
           0,
           platform::errors::InvalidArgument(
               "The number of tensor `%s`'s elements is 0.", var_names[i]));
-      auto len = use_align ? platform::Alignment(
-                                 static_cast<size_t>(size) * size_of_dtype,
-                                 place,
-                                 align_size) /
-                                 size_of_dtype
-                           : static_cast<size_t>(size);
+      auto len = use_align
+                     ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                      place,
+                                      align_size) /
+                           size_of_dtype
+                     : static_cast<size_t>(size);
       const void *ptr =
           lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
       VLOG(4) << size << " " << len;
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 4a4af8d75fb40..2db144f423fc7 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -378,10 +378,6 @@ if(WITH_GPU)
          stats
          op_proto_maker
          shape_inference)
-  nv_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 elseif(WITH_ROCM)
   hip_library(
     profiler
@@ -394,10 +390,6 @@ elseif(WITH_ROCM)
          stats
          op_proto_maker
          shape_inference)
-  hip_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info gpu_info place)
 else()
   cc_library(
     profiler
@@ -409,10 +401,6 @@ else()
          stats
          op_proto_maker
          shape_inference)
-  cc_library(
-    device_memory_aligment
-    SRCS device_memory_aligment.cc
-    DEPS cpu_info place)
 endif()
 
 cc_test(
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 217ba89272412..8ce9dc54b021d 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -79,11 +79,6 @@ size_t CpuMaxAllocSize() {
   return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
 }
 
-size_t CpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  return 1 << 12;
-}
-
 size_t CpuMaxChunkSize() {
   // Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
   // or the initial_cpu_memory_in_mb.
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 6c5bf68227a02..b1220e615da00 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -63,8 +63,7 @@ size_t CpuMaxAllocSize();
 //! Get the maximum allocation size for a machine.
 size_t CUDAPinnedMaxAllocSize();
 
-//! Get the minimum chunk size for buddy allocator.
-size_t CpuMinChunkSize();
+using phi::backends::cpu::CpuMinChunkSize;
 
 //! Get the maximum chunk size for buddy allocator.
 size_t CpuMaxChunkSize();
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index 7cceb8ccec3e1..b25f15d688730 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -124,11 +124,6 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
 
 size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
 
-size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t GpuMaxChunkSize() {
   size_t max_chunk_size = GpuMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
@@ -410,8 +405,8 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id) {  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id) {
   return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
       handle, size, prop, flags);
 }
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h
index 5e5128c29fbbf..bb876f5c526d5 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.h
+++ b/paddle/fluid/platform/device/gpu/gpu_info.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
 
 namespace paddle {
 namespace platform {
@@ -81,8 +82,7 @@ size_t GpuInitAllocSize();
 //! Get the re-allocation size of current GPU device.
 size_t GpuReallocSize();
 
-//! Get the minimum chunk size for GPU buddy allocator.
-size_t GpuMinChunkSize();
+using phi::backends::gpu::GpuMinChunkSize;
 
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
@@ -140,8 +140,8 @@ gpuError_t GpuGetLastError();
 CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                               size_t size,
                               const CUmemAllocationProp *prop,
-                              unsigned long long flags,
-                              int dev_id);  // NOLINT
+                              unsigned long long flags,  // NOLINT
+                              int dev_id);
 
 //! cuMemRelease with recorded info
 CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
diff --git a/paddle/fluid/platform/device/mlu/mlu_info.cc b/paddle/fluid/platform/device/mlu/mlu_info.cc
index a2e063397bd3a..f4df71d984711 100644
--- a/paddle/fluid/platform/device/mlu/mlu_info.cc
+++ b/paddle/fluid/platform/device/mlu/mlu_info.cc
@@ -226,11 +226,6 @@ size_t MLUInitAllocSize() { return MLUAllocSize(/* realloc = */ false); }
 
 size_t MLUReallocSize() { return MLUAllocSize(/* realloc = */ true); }
 
-size_t MLUMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
-
 size_t MLUMaxChunkSize() {
   size_t max_chunk_size = MLUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
diff --git a/paddle/fluid/platform/device/mlu/mlu_info.h b/paddle/fluid/platform/device/mlu/mlu_info.h
index c0cd24f00fbb6..435e71cf10564 100644
--- a/paddle/fluid/platform/device/mlu/mlu_info.h
+++ b/paddle/fluid/platform/device/mlu/mlu_info.h
@@ -25,6 +25,7 @@ limitations under the License. */
 #include <cncl.h>
 #endif
 #include <vector>
+#include "paddle/phi/backends/mlu/mlu_info.h"
 
 namespace paddle {
 
@@ -89,8 +90,7 @@ size_t MLUInitAllocSize();
 //! Get the re-allocation size of current MLU device.
 size_t MLUReallocSize();
 
-//! Get the minimum chunk size for MLU buddy allocator.
-size_t MLUMinChunkSize();
+using phi::backends::mlu::MLUMinChunkSize;
 
 //! Get the maximum chunk size for MLU buddy allocator.
 size_t MLUMaxChunkSize();
diff --git a/paddle/fluid/platform/device/npu/npu_info.cc b/paddle/fluid/platform/device/npu/npu_info.cc
index 9acdef985ade2..b9409802b3d30 100644
--- a/paddle/fluid/platform/device/npu/npu_info.cc
+++ b/paddle/fluid/platform/device/npu/npu_info.cc
@@ -179,14 +179,6 @@ size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); }
 
 size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); }
 
-size_t NPUMinChunkSize() {
-  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
-  // though no document specify that explicitly.
-  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
-  // details.
-  return 1 << 9;
-}
-
 size_t NPUMaxChunkSize() {
   size_t max_chunk_size = NPUMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
diff --git a/paddle/fluid/platform/device/npu/npu_info.h b/paddle/fluid/platform/device/npu/npu_info.h
index ea55831db2e22..534493266a5b6 100644
--- a/paddle/fluid/platform/device/npu/npu_info.h
+++ b/paddle/fluid/platform/device/npu/npu_info.h
@@ -22,6 +22,7 @@ limitations under the License. */
 
 #include "acl/acl.h"
 #include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/phi/backends/npu/npu_info.h"
 
 namespace paddle {
 namespace platform {
@@ -69,8 +70,7 @@ size_t NPUInitAllocSize();
 //! Get the re-allocation size of current NPU device.
 size_t NPUReallocSize();
 
-//! Get the minimum chunk size for NPU buddy allocator.
-size_t NPUMinChunkSize();
+using phi::backends::npu::NPUMinChunkSize;
 
 //! Get the maximum chunk size for NPU buddy allocator.
 size_t NPUMaxChunkSize();
diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h
deleted file mode 100644
index 175a9a136be35..0000000000000
--- a/paddle/fluid/platform/device_memory_aligment.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stddef.h>
-
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/place.h"
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/device/npu/npu_info.h"
-#endif
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
-#ifdef PADDLE_WITH_MLU
-#include "paddle/fluid/platform/device/mlu/mlu_info.h"
-#endif
-
-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size,
-                 const platform::Place &place,
-                 int align_size = -1);
-}  // namespace platform
-}  // namespace paddle
diff --git a/paddle/phi/backends/cpu/cpu_info.h b/paddle/phi/backends/cpu/cpu_info.h
index cf7c6d95057f8..12db2c7d09d39 100644
--- a/paddle/phi/backends/cpu/cpu_info.h
+++ b/paddle/phi/backends/cpu/cpu_info.h
@@ -39,6 +39,13 @@ namespace phi {
 namespace backends {
 namespace cpu {
+
+//! Get the minimum chunk size for buddy allocator.
+inline size_t CpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 4 KB.
+  return 1 << 12;
+}
+
 typedef enum {
   isa_any,
   sse42,
@@ -51,6 +58,7 @@ typedef enum {
   avx512_mic_4ops,
   avx512_bf16,
 } cpu_isa_t;  // Instruction set architecture
+
 }  // namespace cpu
 }  // namespace backends
 }  // namespace phi
diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/phi/backends/device_memory_aligment.h
similarity index 54%
rename from paddle/fluid/platform/device_memory_aligment.cc
rename to paddle/phi/backends/device_memory_aligment.h
index e8a6051c19f2d..a9e1fc384085a 100644
--- a/paddle/fluid/platform/device_memory_aligment.cc
+++ b/paddle/phi/backends/device_memory_aligment.h
@@ -12,38 +12,53 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#pragma once
+#include <stddef.h>
-
-namespace paddle {
-namespace platform {
-size_t Alignment(size_t size, const platform::Place &place, int align_size) {
+#include "paddle/phi/backends/cpu/cpu_info.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/phi/backends/npu/npu_info.h"
+#endif
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#ifdef PADDLE_WITH_MLU
+#include "paddle/phi/backends/mlu/mlu_info.h"
+#endif
+
+namespace phi {
+
+inline size_t Alignment(size_t size,
+                        const phi::Place &place,
+                        int align_size = -1) {
   size_t alignment = 0;
   if (align_size > 0) {
     alignment = align_size;
   } else {
     alignment = 1024;
-    if (platform::is_cpu_place(place)) {
-      alignment = CpuMinChunkSize();
+    if (place.GetType() == phi::AllocationType::CPU) {
+      alignment = phi::backends::cpu::CpuMinChunkSize();
     } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-      alignment = GpuMinChunkSize();
+      alignment = phi::backends::gpu::GpuMinChunkSize();
 #elif defined(PADDLE_WITH_XPU)
       alignment = alignment;
 #elif defined(PADDLE_WITH_ASCEND_CL)
-      alignment = NPUMinChunkSize();
+      alignment = phi::backends::npu::NPUMinChunkSize();
 #elif defined(PADDLE_WITH_MLU)
-      alignment = MLUMinChunkSize();
+      alignment = phi::backends::mlu::MLUMinChunkSize();
 #else
-      PADDLE_THROW(platform::errors::PreconditionNotMet(
+      PADDLE_THROW(phi::errors::PreconditionNotMet(
           "Fluid is not compiled with CUDA/XPU/NPU/MLU."));
 #endif
     }
   }
-  if (is_npu_place(place)) {
+  if (place.GetType() == phi::AllocationType::NPU) {
     size += 32;  // required by ascendcl
   }
   size_t remaining = size % alignment;
   return remaining == 0 ? size : size + (alignment - remaining);
 }
-}  // namespace platform
-}  // namespace paddle
+
+}  // namespace phi
diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h
index 323565c000a1c..0f3c984ce8582 100644
--- a/paddle/phi/backends/gpu/gpu_info.h
+++ b/paddle/phi/backends/gpu/gpu_info.h
@@ -67,6 +67,12 @@ const gpuDeviceProp &GetDeviceProperties(int id);
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);
 
+//! Get the minimum chunk size for GPU buddy allocator.
+inline size_t GpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst,
                     const void *src,
diff --git a/paddle/phi/backends/mlu/mlu_info.h b/paddle/phi/backends/mlu/mlu_info.h
new file mode 100644
index 0000000000000..bf75c1cf295e8
--- /dev/null
+++ b/paddle/phi/backends/mlu/mlu_info.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_MLU
+
+namespace phi {
+namespace backends {
+namespace mlu {
+
+//! Get the minimum chunk size for MLU buddy allocator.
+inline size_t MLUMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
+}  // namespace mlu
+}  // namespace backends
+}  // namespace phi
+
+#endif
diff --git a/paddle/phi/backends/npu/npu_info.h b/paddle/phi/backends/npu/npu_info.h
new file mode 100644
index 0000000000000..21206ae0b28f3
--- /dev/null
+++ b/paddle/phi/backends/npu/npu_info.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_ASCEND_CL
+
+namespace phi {
+namespace backends {
+namespace npu {
+
+//! Get the minimum chunk size for NPU buddy allocator.
+inline size_t NPUMinChunkSize() {
+  // NOTE(zhiqiu): It seems the min chunk size should be 512 on NPU,
+  // though no document specify that explicitly.
+  // See https://gitee.com/zhiqiuchen/Ascend/tree/master/test_reduce_sum_d for
+  // details.
+  return 1 << 9;
+}
+
+}  // namespace npu
+}  // namespace backends
+}  // namespace phi
+
+#endif
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 75659d2bcd81a..ef2231c059ad9 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -76,8 +76,7 @@ set(COMMON_KERNEL_DEPS
     fft
     phi_data_layout_transform
     gpc
-    utf8proc
-    device_memory_aligment)
+    utf8proc)
 
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} processgroup)
 if(WITH_NCCL OR WITH_RCCL)
diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc
index 2fafb5b9efae6..819bb30712bf0 100644
--- a/paddle/phi/kernels/check_memory_continue_kernel.cc
+++ b/paddle/phi/kernels/check_memory_continue_kernel.cc
@@ -20,7 +20,7 @@
 
 #include "paddle/phi/core/kernel_registry.h"
 
-#include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 
 namespace phi {
 
@@ -44,8 +44,7 @@ void CheckMemoryContinueKernel(const Context &dev_ctx,
             input.at(i)->dtype()));
     const void *cur_address = input.at(i - 1)->data();
     int64_t len = input.at(i - 1)->numel();
-    auto offset =
-        paddle::platform::Alignment(len * size_of_dtype, dev_ctx.GetPlace());
+    auto offset = phi::Alignment(len * size_of_dtype, dev_ctx.GetPlace());
     void *infer_next_address = reinterpret_cast<void *>(
         reinterpret_cast<uintptr_t>(cur_address) + offset);
     const void *next_address = input.at(i)->data();
@@ -71,8 +70,8 @@ void CheckMemoryContinueKernel(const Context &dev_ctx,
           infer_next_address,
           next_address));
   }
-  numel += paddle::platform::Alignment(
-      (*input.rbegin())->numel() * size_of_dtype, dev_ctx.GetPlace());
+  numel += phi::Alignment((*input.rbegin())->numel() * size_of_dtype,
+                          dev_ctx.GetPlace());
   // reset holder, do inplace
   output->ShareBufferWith(*input.at(0));
   output->Resize({numel / size_of_dtype});
diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc
index 7c3e3aafd22fa..faa6f71d4cdf5 100644
--- a/paddle/phi/kernels/coalesce_tensor_kernel.cc
+++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc
@@ -17,8 +17,8 @@
 #include <sstream>
 #include <vector>
 
-#include "paddle/fluid/platform/device_memory_aligment.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/device_memory_aligment.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
@@ -72,12 +72,12 @@ void GetMemSizeAndDtype(const std::vector<const DenseTensor *> &lod_tensors,
         0,
         errors::InvalidArgument(
             "The number of `%d`-th tensor's elements is 0.", i));
-    auto len = use_align ? paddle::platform::Alignment(
-                               static_cast<size_t>(size) * size_of_dtype,
-                               place,
-                               align_size) /
-                               size_of_dtype
-                         : static_cast<size_t>(size);
+    auto len = use_align
+                   ? phi::Alignment(static_cast<size_t>(size) * size_of_dtype,
+                                    place,
+                                    align_size) /
+                         size_of_dtype
+                   : static_cast<size_t>(size);
     const void *ptr =
         lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr;
     VLOG(4) << size << " " << len;
@@ -206,7 +206,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
         phi::Copy(dev_ctx, *input[i], dev_ctx.GetPlace(), false, &sub_tensor);
 
         offset += use_align
-                      ? paddle::platform::Alignment(
+                      ? phi::Alignment(
                             len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                             size_of_dtype
                       : len;
@@ -224,7 +224,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
           phi::Copy(dev_ctx, *output[i], dev_ctx.GetPlace(), false, &sub_tensor);
         }
         offset += use_align
-                      ? paddle::platform::Alignment(
+                      ? phi::Alignment(
                             len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                             size_of_dtype
                       : len;
@@ -244,7 +244,7 @@ void CoalesceTensorKernel(const Context &dev_ctx,
         ->ShareDataWith(fused_output->Slice(static_cast<int64_t>(offset),
                                             static_cast<int64_t>(offset + len)))
         .Resize(dim);
-    len = use_align ? paddle::platform::Alignment(
+    len = use_align ? phi::Alignment(
                           len * size_of_dtype, dev_ctx.GetPlace(), align_size) /
                           size_of_dtype
                     : len;
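
For reference, a minimal usage sketch of the relocated helper. This is illustrative only: it assumes a build that can compile against the Paddle source tree so that the new paddle/phi/backends/device_memory_aligment.h is on the include path, and the byte counts are made-up example values, not numbers taken from the patch.

    // Illustrative sketch: exercises phi::Alignment() as declared in
    // paddle/phi/backends/device_memory_aligment.h after this move.
    #include <cstdio>

    #include "paddle/phi/backends/device_memory_aligment.h"

    int main() {
      phi::Place cpu(phi::AllocationType::CPU);

      // With the default align_size (-1), CPU sizes are rounded up to a
      // multiple of CpuMinChunkSize() (4 KB), so 5000 bytes becomes 8192.
      size_t padded = phi::Alignment(5000, cpu);

      // An explicit align_size overrides the per-device minimum chunk size;
      // 5000 bytes rounded up to the next multiple of 256 is 5120.
      size_t padded256 = phi::Alignment(5000, cpu, /*align_size=*/256);

      std::printf("%zu %zu\n", padded, padded256);  // expected: 8192 5120
      return 0;
    }

Because phi::Alignment and the per-device *MinChunkSize() helpers are now header-only inline functions under paddle/phi/backends, the separate device_memory_aligment library target (and its cpu_info/gpu_info/place dependencies) is no longer needed, which is why it is removed from the framework, operators, platform, and kernels CMake files above.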