diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index 83c283a10d576..5114289951318 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -113,11 +113,14 @@ list(APPEND HIP_CXX_FLAGS -Wno-dangling-gsl)
 list(APPEND HIP_CXX_FLAGS -Wno-unused-value)
 list(APPEND HIP_CXX_FLAGS -Wno-braced-scalar-init)
 list(APPEND HIP_CXX_FLAGS -Wno-return-type)
+list(APPEND HIP_CXX_FLAGS -Wno-pragma-once-outside-header)
+
 if(WITH_CINN)
   list(APPEND HIP_CXX_FLAGS -std=c++14)
 else()
   list(APPEND HIP_CXX_FLAGS -std=c++17)
 endif()
+list(APPEND HIP_CXX_FLAGS --gpu-max-threads-per-block=1024) # NOTE(review): per Clang docs, default launch bound for HIP kernels - confirm
 
 if(CMAKE_BUILD_TYPE MATCHES Debug)
   list(APPEND HIP_CXX_FLAGS -g2)
@@ -130,11 +133,11 @@ set(HIP_CLANG_FLAGS ${HIP_CXX_FLAGS})
 # Ask hcc to generate device code during compilation so we can use
 # host linker to link.
 list(APPEND HIP_HCC_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_HCC_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx906) # same archs as HIP_CLANG_FLAGS
+list(APPEND HIP_HCC_FLAGS --offload-arch=gfx908)
 list(APPEND HIP_CLANG_FLAGS -fno-gpu-rdc)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx906)
-list(APPEND HIP_CLANG_FLAGS --amdgpu-target=gfx908)
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx906) # same archs as HIP_HCC_FLAGS
+list(APPEND HIP_CLANG_FLAGS --offload-arch=gfx908)
 
 if(HIP_COMPILER STREQUAL clang)
   set(hip_library_name amdhip64)
diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h
index 7ea9b0cbb6477..ccaf9b82bc5a5 100644
--- a/paddle/phi/common/bfloat16.h
+++ b/paddle/phi/common/bfloat16.h
@@ -31,11 +31,15 @@
 #include <cuda_bf16.h>
 #endif
 
+#ifndef PADDLE_WITH_HIP  // non-HIP: compiler-specific alignment attribute
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else  // PADDLE_WITH_HIP
+#define PADDLE_ALIGN(x)  // HIP build: expands to nothing (no alignment attr)
+#endif  // PADDLE_WITH_HIP
 
 namespace phi {
 namespace dtype {
diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h
index 833ddcf46b2fe..130047f850426 100644
--- a/paddle/phi/common/complex.h
+++ b/paddle/phi/common/complex.h
@@ -31,11 +31,15 @@
 #include <thrust/complex.h>  // NOLINT
 #endif
 
+#ifndef PADDLE_WITH_HIP  // non-HIP: compiler-specific alignment attribute
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else  // PADDLE_WITH_HIP
+#define PADDLE_ALIGN(x)  // HIP build: expands to nothing (no alignment attr)
+#endif  // PADDLE_WITH_HIP
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // todo
diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h
index 86168d441ded2..9d60b8c6241ae 100644
--- a/paddle/phi/common/float16.h
+++ b/paddle/phi/common/float16.h
@@ -51,11 +51,15 @@
 #include <hip/hip_fp16.h>
 #endif
 
+#ifndef PADDLE_WITH_HIP  // non-HIP: compiler-specific alignment attribute
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x) __declspec(align(x))
 #endif
+#else  // PADDLE_WITH_HIP
+#define PADDLE_ALIGN(x)  // HIP build: expands to nothing (no alignment attr)
+#endif  // PADDLE_WITH_HIP
 
 #define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600)
 
diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
index 5a7574b56a891..d2e01503d43b2 100644
--- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu
+++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu
@@ -49,11 +49,15 @@ static inline void GetBlockDims(const phi::GPUContext& context,
   *grid_dims = dim3(grid_cols, grid_rows, 1);
 }
 
+#ifndef PADDLE_WITH_HIP  // non-HIP: aligned attribute unless _WIN32
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x)
 #endif
+#else  // PADDLE_WITH_HIP
+#define PADDLE_ALIGN(x)  // HIP build: expands to nothing (no alignment attr)
+#endif  // PADDLE_WITH_HIP
 
 template <typename T, int Size>
 struct PointerWrapper {
@@ -572,15 +576,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
   std::vector<IndexT> inputs_col_vec(inputs_col_num, 0);
   const T** inputs_data = inputs_data_vec.data();
   IndexT* inputs_col = inputs_col_vec.data();
-#ifdef PADDLE_WITH_HIP
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  phi::Allocator::AllocationPtr data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), in_num * sizeof(T*));
-  inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
-  phi::Allocator::AllocationPtr col_alloc = phi::memory_utils::Alloc(
-      phi::GPUPinnedPlace(), inputs_col_num * sizeof(IndexT));
-  inputs_col = reinterpret_cast<IndexT*>(col_alloc->ptr());
-#endif
 
   bool has_same_shape = true;
   for (int i = 0; i < in_num; ++i) {
@@ -604,19 +599,6 @@ void ConcatFunctorWithIndexType(const phi::GPUContext& ctx,
                                   in_num,
                                   limit_num,
                                   has_same_shape);
-
-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* col_alloc_released = col_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, col_alloc_released] {
-    VLOG(4) << "Delete cuda pinned at " << data_alloc_released;
-    VLOG(4) << "Delete cuda pinned at " << col_alloc_released;
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(col_alloc_released);
-  });
-#endif
 }
 
 template <typename T>
@@ -780,25 +762,6 @@ void SplitFunctorDispatchWithIndexType(
   IndexT* outs_cols = outputs_cols_vec.data();
   T** outs_data = nullptr;
 
-// There are some differences between hip runtime and NV runtime.
-// In NV, when the pageable memory data less than 64K is transferred from
-// hosttodevice, it will be automatically asynchronous.
-// However, only pinned memory in hip can copy asynchronously
-// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
-// 3.2.6.1. Concurrent Execution between Host and Device
-// Memory copies from host to device of a memory block of 64 KB or less
-#ifdef PADDLE_WITH_HIP
-  phi::Allocator::AllocationPtr data_alloc, cols_alloc;
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  data_alloc =
-      phi::memory_utils::Alloc(phi::GPUPinnedPlace(), out_num * sizeof(T*));
-  outs_data = reinterpret_cast<T**>(data_alloc->ptr());
-  // TODO(chentianyu03): try to find a method to remove the Alloc function
-  cols_alloc = phi::memory_utils::Alloc(phi::GPUPinnedPlace(),
-                                        (out_cols_num) * sizeof(IndexT));
-  outs_cols = reinterpret_cast<IndexT*>(cols_alloc->ptr());
-#endif
-
   outs_cols[0] = 0;
   for (int i = 0; i < out_num; ++i) {
     IndexT t_col = ref_ins.at(i)->numel() / out_row;
@@ -835,17 +798,6 @@ void SplitFunctorDispatchWithIndexType(
               outs_data));
     }
   }
-
-#ifdef PADDLE_WITH_HIP
-  // Prevent pinned memory from being covered and release the memory after
-  // kernel launch of the stream is executed (reapply pinned memory next time)
-  auto* data_alloc_released = data_alloc.release();
-  auto* cols_alloc_released = cols_alloc.release();
-  ctx.AddStreamCallback([data_alloc_released, cols_alloc_released] {
-    phi::memory_utils::AllocationDeleter(data_alloc_released);
-    phi::memory_utils::AllocationDeleter(cols_alloc_released);
-  });
-#endif
 }
 
 template <typename T>
diff --git a/paddle/phi/kernels/funcs/segmented_array.h b/paddle/phi/kernels/funcs/segmented_array.h
index 60e25a95747ea..e6ecb9819e505 100644
--- a/paddle/phi/kernels/funcs/segmented_array.h
+++ b/paddle/phi/kernels/funcs/segmented_array.h
@@ -21,11 +21,15 @@
 namespace phi {
 namespace funcs {
 
+#ifndef PADDLE_WITH_HIP  // non-HIP: aligned attribute unless _WIN32
 #if !defined(_WIN32)
 #define PADDLE_ALIGN(x) __attribute__((aligned(x)))
 #else
 #define PADDLE_ALIGN(x)
 #endif
+#else  // PADDLE_WITH_HIP
+#define PADDLE_ALIGN(x)  // HIP build: expands to nothing (no alignment attr)
+#endif  // PADDLE_WITH_HIP
 
 enum class SegmentedArraySize {
   kVariableLength = 0,