diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
index d62fc1c084962..e342d4422a5ed 100644
--- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
@@ -7,10 +7,10 @@ IF(WITH_GPU)
     get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
     SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS})
   endif()
-  nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
-  nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
-  nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
-  nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
+  nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h feature_value.cu DEPS ${HETERPS_DEPS})
+  nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h feature_value.cu DEPS ${HETERPS_DEPS})
+  nv_library(heter_comm SRCS heter_comm.h feature_value.h feature_value.cu heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
+  nv_test(test_heter_comm SRCS DEPS heter_comm)
   nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
   if(WITH_PSCORE)
     nv_library(graph_gpu_ps SRCS graph_gpu_ps_table_inl.cu DEPS heter_comm table hashtable_kernel)
diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu
new file mode 100644
index 0000000000000..25ad7e99f2895
--- /dev/null
+++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu
@@ -0,0 +1,445 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+
+namespace paddle {
+namespace framework {
+
+const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
+#define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
+#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
+
+template <typename FVAccessor>
+__global__ void PullCopy(float** dest,
+                         const float* src,
+                         const int64_t* len,
+                         int slot_num,
+                         int total_len,
+                         uint64_t** keys,
+                         uint64_t max_val_size,
+                         int* gpu_dim,
+                         FVAccessor feature_value_accessor) {
+  CUDA_KERNEL_LOOP(i, total_len) {
+    int low = 0;
+    int high = slot_num - 1;
+    while (low < high) {
+      int mid = (low + high) / 2;
+      if (i < len[mid])
+        high = mid;
+      else
+        low = mid + 1;
+    }
+    int x = low;
+    int y = i - (x ?
len[x - 1] : 0); + float* feature_value_ptr = + (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + int mf_dim = gpu_dim[x] - 3; + feature_value_accessor.Select( + dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); + } +} + +template +__global__ void PullDedupCopy( + const size_t N, const uint64_t* total_keys, float** dest, const float* src, + const int64_t* slot_lens, uint64_t max_val_size, const int* slot_dims, + const int hidden, const int* key2slot, const uint32_t* restore_idx, + TAccess accessor) { + CUDA_KERNEL_LOOP(idx, N) { + int i = idx / hidden; + int off = idx % hidden; + + int x = key2slot[i]; + int y = i - slot_lens[x]; + + assert(slot_dims[x] == hidden); + float* dest_ptr = dest[x] + y * hidden; + // 0 key fill zero + if (total_keys[i] == 0) { + *(dest_ptr + off) = 0; + return; + } + + float* src_ptr = + (float*)((char*)src + + uint64_t(restore_idx[i]) * uint64_t(max_val_size)); + switch (off) { + case 0: + *(dest_ptr + off) = src_ptr[accessor.ShowIndex()]; + break; + case 1: + *(dest_ptr + off) = src_ptr[accessor.ClickIndex()]; + break; + case 2: + *(dest_ptr + off) = src_ptr[accessor.EmbedWIndex()]; + break; + default: + if (src_ptr[accessor.MfSizeIndex()] == 0) { + *(dest_ptr + off) = 0; + } else { + *(dest_ptr + off) = + src_ptr[accessor.EmbedxWIndex() + off - 3]; + } + break; + } + } +} + +template +__global__ void PushCopyWithPool(float* dest, + float** src, + int64_t* len, + int slot_num, + uint64_t total_len, + int bs, + int* slot_vector, + int* mf_dim_vector, + size_t grad_value_size, + FVAccessor feature_value_accessor) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + float* cur = (float*)((char*)dest + i * grad_value_size); + + cur[feature_value_accessor.common_push_value.SlotIndex()] = + (float)slot_vector[x]; + int mf_dim = mf_dim_vector[x]; + cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim; + + cur[feature_value_accessor.common_push_value.ShowIndex()] = + *(src[x] + y * (mf_dim + 3)); + cur[feature_value_accessor.common_push_value.ClickIndex()] = + *(src[x] + y * (mf_dim + 3) + 1); + cur[feature_value_accessor.common_push_value.EmbedGIndex()] = + *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + for (int j = 0; j < mf_dim; j++) { + cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = + *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; + } + } +} + +template +__global__ void PushMergeCopyAtomic( + const size_t N, const uint64_t* total_keys, float* dest, float** src, + const int hidden, const int bs, const int* slot_vector, + const int* slot_dims, const int64_t* slot_lens, const int* key2slot, + const uint32_t* d_restore_idx, size_t grad_value_size, + TAccess accessor) { + CUDA_KERNEL_LOOP(idx, N) { + int i = idx / hidden; + int off = idx % hidden; + // filter 0 keys + if (total_keys[i] == 0) { + return; + } + + int x = key2slot[i]; + int y = i - slot_lens[x]; + + const float* ptr = src[x] + y * hidden; + float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size); + int mf_dim = slot_dims[x] - 3; + switch (off) { + case 0: + cur[accessor.SlotIndex()] = (float)slot_vector[x]; + cur[accessor.MfDimIndex()] = mf_dim; + paddle::platform::CudaAtomicAdd( + &cur[accessor.ShowIndex()], *(ptr + off)); + break; + case 1: + paddle::platform::CudaAtomicAdd( + &cur[accessor.ClickIndex()], *(ptr + off)); + break; + case 2: + paddle::platform::CudaAtomicAdd( + &cur[accessor.EmbedGIndex()], *(ptr + off) * -1. * bs); + break; + default: + int embedx_idx = off - 3; + if (mf_dim < embedx_idx) { + return; + } + paddle::platform::CudaAtomicAdd( + &cur[accessor.EmbedxGIndex() + embedx_idx], *(ptr + off) * -1. * bs); + break; + } + } +} + +#define SUM_GRAD_VALUE \ + for (uint32_t j = 0; j < count; ++j) { \ + const uint32_t& pos = d_sort_idx[start + j]; \ + const int& x = key2slot[pos]; \ + y = pos - slot_lens[x]; \ + val += *(reinterpret_cast(src[x] + y * hidden + off)); \ + } + +template +__global__ void PushMergeCopy( + const size_t N, const uint64_t* total_keys, float* dest, float** src, + const int hidden, const int bs, const int* slot_vector, + const int* slot_dims, const int64_t* slot_lens, const int* key2slot, + const uint32_t* d_sort_idx, + const uint32_t* d_sort_offset, + const uint32_t* d_sort_cnt, size_t grad_value_size, + TAccess accessor) { + CUDA_KERNEL_LOOP(idx, N) { + int i = idx / hidden; + int off = idx % hidden; + // filter 0 keys + float* cur = (float*)((char*)dest + i * grad_value_size); + + if (total_keys[i] == 0) { + switch (off) { + case 0: + cur[accessor.SlotIndex()] = 0; + cur[accessor.MfDimIndex()] = 0; + cur[accessor.ShowIndex()] = 0.0; + break; + case 1: + cur[accessor.ClickIndex()] = 0.0; + break; + case 2: + cur[accessor.EmbedGIndex()] = 0.0; + break; + default: + cur[accessor.EmbedxGIndex() + off - 3] = 0.0; + break; + } + return; + } + + const uint32_t& start = d_sort_offset[i]; + const uint32_t& count = d_sort_cnt[i]; + const uint32_t& pos = d_sort_idx[start]; + + const int& x = key2slot[pos]; + int y = pos - slot_lens[x]; + int mf_dim = slot_dims[x] - 3; + + double val = 0.0; + + switch (off) { + case 0: + cur[accessor.SlotIndex()] = (float)slot_vector[x]; + cur[accessor.MfDimIndex()] = mf_dim; + SUM_GRAD_VALUE + cur[accessor.ShowIndex()] = val; + break; + case 1: + SUM_GRAD_VALUE + cur[accessor.ClickIndex()] = val; + break; + case 2: + SUM_GRAD_VALUE + cur[accessor.EmbedGIndex()] = val * -1. * bs; + break; + default: + int embedx_idx = off - 3; + if (mf_dim < embedx_idx) { + cur[accessor.EmbedxGIndex() + embedx_idx] = 0.0; + return; + } + SUM_GRAD_VALUE + cur[accessor.EmbedxGIndex() + embedx_idx] = val * -1. 
* bs; + break; + } + } +} + +template +void AccessorWrapper::CopyForPullImpl( + const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, + values.data(), + values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, + total_values_gpu, + gpu_len, + slot_num, + total_length, + gpu_keys, + feature_value_size, + gpu_dim, + gpu_accessor_); + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPushImpl( + const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + auto buf_mf_dim_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); + cudaMemcpy(gpu_values, + grad_values.data(), + grad_values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, + slot_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, + slot_mf_dim_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); + PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, + gpu_values, + gpu_len, + slot_lengths.size(), + total_length, + batch_size, + d_slot_vector, + d_mf_dim_vector, + grad_value_size, + gpu_accessor_); + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPullDedupImpl( + const paddle::platform::Place& place, + const uint64_t* total_keys, float** gpu_values, + const float* total_values_gpu, + const int64_t* slot_lens, const int* key2slot, + const int hidden_size, + const int64_t total_length, + const int* slot_dims, + const uint32_t* gpu_restore_idx, + int pull_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + size_t N = total_length * hidden_size; + PullDedupCopy<<>>( + N, total_keys, gpu_values, total_values_gpu, slot_lens, pull_value_size, + slot_dims, hidden_size, key2slot, gpu_restore_idx, + 
gpu_accessor_.common_pull_value); + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPushDedupImpl( + const paddle::platform::Place& place, + const uint64_t* total_keys, float** grad_values, + float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, + const int64_t dedup_length, const int batch_size, + const int* slot_dims, const int* key2slot, + const uint32_t* d_restore_idx, + const size_t grad_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + cudaMemsetAsync(total_grad_values_gpu, 0, dedup_length * grad_value_size, + stream); + size_t N = total_length * hidden_size; + PushMergeCopyAtomic<<>>( + N, total_keys, total_grad_values_gpu, grad_values, hidden_size, + batch_size, slots, slot_dims, slot_lens, key2slot, d_restore_idx, + grad_value_size, gpu_accessor_.common_push_value); + + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPushDedupImpl( + const paddle::platform::Place& place, + const uint64_t* total_keys, float** grad_values, + float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, const int64_t dedup_length, + const int batch_size, const int* slot_dims, + const int* key2slot, + const uint32_t* gpu_sort_idx, + const uint32_t* gpu_sort_offset, + const uint32_t* gpu_sort_lens, + const size_t grad_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + // merge all grad to one + size_t N = dedup_length * hidden_size; + PushMergeCopy<<>>( + N, total_keys, total_grad_values_gpu, grad_values, hidden_size, + batch_size, slots, slot_dims, slot_lens, key2slot, + gpu_sort_idx, gpu_sort_offset, gpu_sort_lens, + grad_value_size, gpu_accessor_.common_push_value); + cudaStreamSynchronize(stream); +} + +#ifdef PADDLE_WITH_PSCORE +template class AccessorWrapper; +#endif + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 134fd1f2760c0..d237521eecf91 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -20,24 +20,34 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#endif namespace paddle { namespace framework { #define MF_DIM 8 typedef uint64_t FeatureKey; +#define TYPEALIGN(ALIGNVAL, LEN) \ + (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) class FeatureValueAccessor { public: - __host__ __device__ FeatureValueAccessor() {} + __host__ __device__ FeatureValueAccessor() {} __host__ __device__ ~FeatureValueAccessor() {} - __host__ __device__ virtual int Configure(std::unordered_map config) { + __host__ __device__ virtual int Configure( + std::unordered_map config) { _config = config; Initialize(); return 0; } - __host__ __device__ virtual int Initialize() = 0; + __host__ __device__ virtual int Initialize() = 0; protected: std::unordered_map _config; @@ -63,47 +73,58 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { std::vector embedx_w; */ - __host__ __device__ int Dim() { return 9 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } // has cpu_ptr(2) - __host__ __device__ int DimSize(size_t dim, int embedx_dim) { return sizeof(float); } - __host__ __device__ int Size() { return Dim() * sizeof(float); } // cpu_ptr:uint64=2float - __host__ __device__ int EmbedDim() { return embed_sgd_dim;} - __host__ __device__ int EmbedXDim() { return embedx_sgd_dim;} - __host__ __device__ int EmbedWDim() { return embedx_dim;} - __host__ __device__ int CpuPtrIndex() {return 0; } // cpuprt uint64 - __host__ __device__ int DeltaScoreIndex() { return CpuPtrIndex() + 2; } + __host__ __device__ int Dim() { + return 9 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; + } // has cpu_ptr(2) + __host__ __device__ int DimSize(size_t dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ size_t Size() { + return TYPEALIGN(8, Dim() * sizeof(float)); + } // cpu_ptr:uint64=2float + __host__ __device__ int EmbedDim() { return embed_sgd_dim; } + __host__ __device__ int EmbedXDim() { return embedx_sgd_dim; } + __host__ __device__ int EmbedWDim() { return embedx_dim; } + __host__ __device__ int CpuPtrIndex() { return 0; } // cpuprt uint64 + __host__ __device__ int DeltaScoreIndex() { return CpuPtrIndex() + 2; } __host__ __device__ int ShowIndex() { return DeltaScoreIndex() + 1; } __host__ __device__ int ClickIndex() { return ShowIndex() + 1; } __host__ __device__ int EmbedWIndex() { return ClickIndex() + 1; } __host__ __device__ int EmbedG2SumIndex() { return EmbedWIndex() + 1; } - __host__ __device__ int SlotIndex() { return EmbedG2SumIndex() + embed_sgd_dim; } + __host__ __device__ int SlotIndex() { + return EmbedG2SumIndex() + embed_sgd_dim; + } __host__ __device__ int MfDimIndex() { return SlotIndex() + 1; } - __host__ __device__ int MfSizeIndex() { return MfDimIndex() + 1; } // actual mf size (ex. 0) + __host__ __device__ int MfSizeIndex() { + return MfDimIndex() + 1; + } // actual mf size (ex. 
0) __host__ __device__ int EmbedxG2SumIndex() { return MfSizeIndex() + 1; } - __host__ __device__ int EmbedxWIndex() { return EmbedxG2SumIndex() + embedx_sgd_dim; } - + __host__ __device__ int EmbedxWIndex() { + return EmbedxG2SumIndex() + embedx_sgd_dim; + } // 根据mf_dim计算的总长度 __host__ __device__ int Dim(int& mf_dim) { int tmp_embedx_sgd_dim = 1; - if (optimizer_type_ == 3) {//adam + if (optimizer_type_ == 3) { // adam tmp_embedx_sgd_dim = mf_dim * 2 + 2; - } else if (optimizer_type_ == 4) { //shared_adam + } else if (optimizer_type_ == 4) { // shared_adam tmp_embedx_sgd_dim = 4; } return 9 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim; } // 根据mf_dim 计算的总byte数 - __host__ __device__ int Size(int& mf_dim) { - return Dim(mf_dim) * sizeof(float); // cpu_ptr:2float + __host__ __device__ size_t Size(int& mf_dim) { + return TYPEALIGN(8, Dim(mf_dim) * sizeof(float)); // cpu_ptr:2float } // 根据mf_dim 计算的 mf_size byte数 - __host__ __device__ int MFSize(int& mf_dim) { + __host__ __device__ size_t MFSize(int& mf_dim) { int tmp_embedx_sgd_dim = 1; - if (optimizer_type_ == 3) { //adam + if (optimizer_type_ == 3) { // adam tmp_embedx_sgd_dim = mf_dim * 2 + 2; - } else if (optimizer_type_ == 4) { //shared_adam + } else if (optimizer_type_ == 4) { // shared_adam tmp_embedx_sgd_dim = 4; } return (tmp_embedx_sgd_dim + mf_dim) * sizeof(float); @@ -111,33 +132,42 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { __host__ __device__ int EmbedxG2SumOffsetIndex() { return 0; } __host__ __device__ int EmbedxWOffsetIndex(float* val) { - // has mf + // has mf int tmp_embedx_sgd_dim = 1; if (int(MfSize(val)) > 0) { - if (optimizer_type_ == 3) {//adam + if (optimizer_type_ == 3) { // adam tmp_embedx_sgd_dim = int(MfDim(val)) * 2 + 2; - } else if (optimizer_type_ == 4) { //shared_adam + } else if (optimizer_type_ == 4) { // shared_adam tmp_embedx_sgd_dim = 4; } - return EmbedxG2SumIndex() + tmp_embedx_sgd_dim; + return EmbedxG2SumIndex() + tmp_embedx_sgd_dim; } else { // no mf return 0; } } - - __host__ __device__ uint64_t CpuPtr(float* val) {return *(reinterpret_cast(val)); } - __host__ __device__ float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; } + __host__ __device__ uint64_t CpuPtr(float* val) { + return *(reinterpret_cast(val)); + } + __host__ __device__ float& DeltaScore(float* val) { + return val[DeltaScoreIndex()]; + } __host__ __device__ float& Show(float* val) { return val[ShowIndex()]; } __host__ __device__ float& Click(float* val) { return val[ClickIndex()]; } __host__ __device__ float& Slot(float* val) { return val[SlotIndex()]; } __host__ __device__ float& MfDim(float* val) { return val[MfDimIndex()]; } __host__ __device__ float& MfSize(float* val) { return val[MfSizeIndex()]; } __host__ __device__ float& EmbedW(float* val) { return val[EmbedWIndex()]; } - __host__ __device__ float& EmbedG2Sum(float* val) { return val[EmbedG2SumIndex()]; } - __host__ __device__ float& EmbedxG2Sum(float* val) { return val[EmbedxG2SumIndex()]; } - __host__ __device__ float& EmbedxW(float* val) { return val[EmbedxWIndex()]; } + __host__ __device__ float& EmbedG2Sum(float* val) { + return val[EmbedG2SumIndex()]; + } + __host__ __device__ float& EmbedxG2Sum(float* val) { + return val[EmbedxG2SumIndex()]; + } + __host__ __device__ float& EmbedxW(float* val) { + return val[EmbedxWIndex()]; + } int embed_sgd_dim; int embedx_dim; @@ -146,21 +176,27 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { }; struct CommonPullValue { - /* - float show; - float click; - float embed_w; - float 
mf_size - std::vector embedx_w; - */ - __host__ __device__ int ShowIndex() { return 0; } - __host__ __device__ int ClickIndex() { return 1; } - __host__ __device__ int EmbedWIndex() { return 2; } - __host__ __device__ int MfSizeIndex() { return 3; } // actual mf size (ex. 0) - __host__ __device__ int EmbedxWIndex() { return 4; } - __host__ __device__ int Size(const int mf_dim) { - return (4 + mf_dim) * sizeof(float); - } + /* + float show; + float click; + float embed_w; + float mf_size + std::vector embedx_w; + */ + __host__ __device__ static int Dim(int embedx_dim) { + return 4 + embedx_dim; + } + __host__ __device__ int DimSize(size_t dim) { return sizeof(float); } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } + __host__ __device__ int ShowIndex() { return 0; } + __host__ __device__ int ClickIndex() { return 1; } + __host__ __device__ int EmbedWIndex() { return 2; } + __host__ __device__ int MfSizeIndex() { + return 3; + } // actual mf size (ex. 0) + __host__ __device__ int EmbedxWIndex() { return 4; } }; struct CommonPushValue { @@ -175,14 +211,28 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { __host__ __device__ int Dim(int embedx_dim) { return 5 + embedx_dim; } - __host__ __device__ int DimSize(int dim, int embedx_dim) { return sizeof(float); } - __host__ __device__ int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + __host__ __device__ int DimSize(int dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } __host__ __device__ int SlotIndex() { return 0; } - __host__ __device__ int ShowIndex() { return CommonPushValue::SlotIndex() + 1; } - __host__ __device__ int ClickIndex() { return CommonPushValue::ShowIndex() + 1; } - __host__ __device__ int MfDimIndex() { return CommonPushValue::ClickIndex() + 1; } - __host__ __device__ int EmbedGIndex() { return CommonPushValue::MfDimIndex() + 1; } - __host__ __device__ int EmbedxGIndex() { return CommonPushValue::EmbedGIndex() + 1; } + __host__ __device__ int ShowIndex() { + return CommonPushValue::SlotIndex() + 1; + } + __host__ __device__ int ClickIndex() { + return CommonPushValue::ShowIndex() + 1; + } + __host__ __device__ int MfDimIndex() { + return CommonPushValue::ClickIndex() + 1; + } + __host__ __device__ int EmbedGIndex() { + return CommonPushValue::MfDimIndex() + 1; + } + __host__ __device__ int EmbedxGIndex() { + return CommonPushValue::EmbedGIndex() + 1; + } __host__ __device__ float& Slot(float* val) { return val[CommonPushValue::SlotIndex()]; } @@ -203,21 +253,20 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { } }; - __host__ __device__ CommonFeatureValueAccessor() {} __host__ __device__ ~CommonFeatureValueAccessor() {} __host__ __device__ virtual int Initialize() { int optimizer_type = (_config.find("optimizer_type") == _config.end()) - ? 1 - : int(_config["optimizer_type"]); + ? 1 + : int(_config["optimizer_type"]); int sparse_embedx_dim = (_config.find("embedx_dim") == _config.end()) ? 
8 : int(_config["embedx_dim"]); - if (optimizer_type == 3) { //adam + if (optimizer_type == 3) { // adam common_feature_value.embed_sgd_dim = 4; common_feature_value.embedx_sgd_dim = sparse_embedx_dim * 2 + 2; - } else if (optimizer_type == 4) { //shared_adam + } else if (optimizer_type == 4) { // shared_adam common_feature_value.embed_sgd_dim = 4; common_feature_value.embedx_sgd_dim = 4; } else { @@ -230,7 +279,252 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { return 0; } - __host__ __device__ std::string ParseToString(const float* v, int param_size) { + // // build阶段从cpu_val赋值给gpu_val + __host__ void BuildFill( + float* gpu_val, void* cpu, + paddle::distributed::ValueAccessor* cpu_table_accessor, int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); + paddle::distributed::FixedFeatureValue* cpu_ptr = + (paddle::distributed::FixedFeatureValue*)(cpu); + float* cpu_val = cpu_ptr->data(); + size_t cpu_dim = cpu_ptr->size(); + + gpu_val[common_feature_value.DeltaScoreIndex()] = + cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()]; + gpu_val[common_feature_value.ShowIndex()] = + cpu_val[cpu_accessor->common_feature_value.ShowIndex()]; + gpu_val[common_feature_value.ClickIndex()] = + cpu_val[cpu_accessor->common_feature_value.ClickIndex()]; + gpu_val[common_feature_value.SlotIndex()] = + cpu_val[cpu_accessor->common_feature_value.SlotIndex()]; + gpu_val[common_feature_value.EmbedWIndex()] = + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + gpu_val[common_feature_value.EmbedG2SumIndex() + i] = + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i]; + } + *(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex())) = (uint64_t)(cpu); + cpu_val[cpu_accessor->common_feature_value.MfDimIndex()] = float(mf_dim); + gpu_val[common_feature_value.MfDimIndex()] = mf_dim; + if (cpu_dim > cpu_accessor->GetAccessorInfo().dim - + cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)) { + gpu_val[common_feature_value.MfSizeIndex()] = + common_feature_value.MFSize(mf_dim) / sizeof(float); + + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) { + gpu_val[common_feature_value.EmbedxG2SumIndex() + x] = + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x]; + } + } else { + gpu_val[common_feature_value.MfSizeIndex()] = 0; + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); x++) { + gpu_val[x] = 0; + } + } +#endif + } + + // dump_to_cpu阶段从gpu_val赋值给cpu_val + __host__ void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); + + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(*(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex()))); + size_t downpour_value_size = downpour_value->size(); + if (gpu_val[common_feature_value.MfSizeIndex()] > 0 && + downpour_value_size == (cpu_accessor->GetAccessorInfo().dim - + int(cpu_accessor->GetAccessorInfo().mf_size / + sizeof(float)))) { // cpu_accessor + downpour_value->resize(cpu_accessor->common_feature_value.Dim(mf_dim)); + } + float* cpu_val = downpour_value->data(); + cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()] = + 
gpu_val[common_feature_value.DeltaScoreIndex()]; + cpu_val[cpu_accessor->common_feature_value.ShowIndex()] = + gpu_val[common_feature_value.ShowIndex()]; + cpu_val[cpu_accessor->common_feature_value.ClickIndex()] = + gpu_val[common_feature_value.ClickIndex()]; + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()] = + gpu_val[common_feature_value.EmbedWIndex()]; + cpu_val[cpu_accessor->common_feature_value.SlotIndex()] = + gpu_val[common_feature_value.SlotIndex()]; + + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i] = + gpu_val[common_feature_value.EmbedG2SumIndex() + i]; + } + + if (gpu_val[common_feature_value.MfSizeIndex()] > 0) { + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) { + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x] = + gpu_val[common_feature_value.EmbedxG2SumIndex() + x]; + } + } +#endif + } + + // dy_mf_fill_dvals_kernel 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void FeatureValueFill(float* dest_val, float* src_val, + int mf_dim) { + *(reinterpret_cast(dest_val + + common_feature_value.CpuPtrIndex())) = + *(reinterpret_cast(src_val + + common_feature_value.CpuPtrIndex())); + dest_val[common_feature_value.DeltaScoreIndex()] = + src_val[common_feature_value.DeltaScoreIndex()]; + dest_val[common_feature_value.ShowIndex()] = + src_val[common_feature_value.ShowIndex()]; + dest_val[common_feature_value.ClickIndex()] = + src_val[common_feature_value.ClickIndex()]; + dest_val[common_feature_value.EmbedWIndex()] = + src_val[common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + dest_val[common_feature_value.EmbedG2SumIndex() + i] = + src_val[common_feature_value.EmbedG2SumIndex() + i]; + } + dest_val[common_feature_value.SlotIndex()] = + src_val[common_feature_value.SlotIndex()]; + dest_val[common_feature_value.MfDimIndex()] = mf_dim; + dest_val[common_feature_value.MfSizeIndex()] = + src_val[common_feature_value.MfSizeIndex()]; + + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); x++) { + dest_val[x] = src_val[x]; + } + } + + // dy_mf_fill_dvals_kernel, dy_mf_search_kernel 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void PullValueFill(float* dest_val, float* src_val) { + dest_val[common_pull_value.ShowIndex()] = + src_val[common_feature_value.ShowIndex()]; + dest_val[common_pull_value.ClickIndex()] = + src_val[common_feature_value.ClickIndex()]; + dest_val[common_pull_value.EmbedWIndex()] = + src_val[common_feature_value.EmbedWIndex()]; + + int mf_size = int(src_val[common_feature_value.MfSizeIndex()]); + if (mf_size == 0) { + dest_val[common_pull_value.MfSizeIndex()] = 0; + return; + } + // set pull value real dim size + int mf_dim = int(src_val[common_feature_value.MfDimIndex()]); + dest_val[common_pull_value.MfSizeIndex()] = mf_dim; + + int embedx_off = common_pull_value.EmbedxWIndex(); + int value_off = common_feature_value.EmbedxWIndex(); + for (int k = 0; k < mf_dim; ++k) { + dest_val[embedx_off + k] = src_val[value_off + k]; + } + } + + // dy_mf_fill_shard_grads_kernel,update_one 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void PushValueFill(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + 
dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + + for (int x = 0; x < int(src_val[common_push_value.MfDimIndex()]); x++) { + dest_val[common_push_value.EmbedxGIndex() + x] = + src_val[common_push_value.EmbedxGIndex() + x]; + } + } + + // update_basic 阶段 gpukernel 中从src_val赋值给dest_val + __host__ __device__ void PushValueFillBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + } + + // merge_one 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValue(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; + for (int j = 0; j < int(dest_val[common_push_value.MfDimIndex()]); j++) { + dest_val[common_push_value.EmbedxGIndex() + j] += + src_val[common_push_value.EmbedxGIndex() + j]; + } + } + + // merge_basic 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValueBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; + } + + // PullCopy 阶段 gpukernel 中 FeatureValue回填到PullValue + __host__ __device__ void Select(float* dest_val, float* src_val, + uint64_t* key, int mf_dim) { + if (*key == 0) { + *(dest_val + common_pull_value.ShowIndex()) = 0; + *(dest_val + common_pull_value.ClickIndex()) = 0; + *(dest_val + common_pull_value.EmbedWIndex()) = 0; + } else { + *(dest_val + common_pull_value.ShowIndex()) = + src_val[common_pull_value.ShowIndex()]; + *(dest_val + common_pull_value.ClickIndex()) = + src_val[common_pull_value.ClickIndex()]; + *(dest_val + common_pull_value.EmbedWIndex()) = + src_val[common_pull_value.EmbedWIndex()]; + } + + if (src_val[common_pull_value.MfSizeIndex()] == 0 || *key == 0) { + for (int j = 0; j < mf_dim; j++) { + *(dest_val + common_pull_value.EmbedxWIndex() + j) = 0; + } + } else { + for (int j = 0; j < mf_dim; j++) { + // common_pull_value EmbedxWIndex 之前还有 MfSizeIndex, + // 所以这里没有直接使用 common_pull_value.EmbedxWIndex() + *(dest_val + 3 + j) = src_val[common_pull_value.EmbedxWIndex() + j]; + } + } + } + + __host__ __device__ std::string ParseToString(const float* v, + int param_size) { /* uint64_t cpu_ptr; // 2float float delta_score; @@ -245,21 +539,21 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { std::vector embedx_w; */ std::stringstream os; - os << "cpuptr: " << common_feature_value.CpuPtr(const_cast(v)) << " delta_score: " << v[2] - << " show: " << v[3] << " click: " << v[4] - << " embed_w:" << v[5] << " embed_g2sum:"; + os << "cpuptr: " 
<< common_feature_value.CpuPtr(const_cast(v)) + << " delta_score: " << v[2] << " show: " << v[3] << " click: " << v[4] + << " embed_w:" << v[5] << " embed_g2sum:"; for (int i = common_feature_value.EmbedG2SumIndex(); - i < common_feature_value.SlotIndex(); i++) { + i < common_feature_value.SlotIndex(); i++) { os << " " << v[i]; } int mf_dim = int(common_feature_value.MfDim(const_cast(v))); - os << " slot: " << common_feature_value.Slot(const_cast(v)) - << " mf_dim: " << mf_dim - << " mf_size: " << common_feature_value.MfSize(const_cast(v)) - << " mf: "; + os << " slot: " << common_feature_value.Slot(const_cast(v)) + << " mf_dim: " << mf_dim + << " mf_size: " << common_feature_value.MfSize(const_cast(v)) + << " mf: "; if (param_size > common_feature_value.EmbedxG2SumIndex()) { for (auto i = common_feature_value.EmbedxG2SumIndex(); - i < common_feature_value.Dim(mf_dim); ++i) { + i < common_feature_value.Dim(mf_dim); ++i) { os << " " << v[i]; } } @@ -272,7 +566,6 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { CommonPullValue common_pull_value; }; - struct FeatureValue { float delta_score; float show; @@ -344,6 +637,253 @@ struct FeaturePushValue { } }; +class VirtualAccessor { + public: + virtual int Configure(std::unordered_map config) = 0; + + virtual size_t GetFeatureValueSize(int& mf_dim) = 0; + + virtual size_t GetPushValueSize(int& mf_dim) = 0; + + virtual size_t GetPullValueSize(int& mf_dim) = 0; + + virtual void BuildFill(void* gpu_val, void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; + + virtual void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, const int64_t total_length, + int* gpu_dim, int feature_value_size) = 0; + // dedup + virtual void CopyForPull(const paddle::platform::Place& place, + const uint64_t* total_keys, float** gpu_values, + const float* total_values_gpu, + const int64_t* slot_lens, const int* key2slot, + const int hidden_size, const int64_t total_length, + const int* slot_dims, + const uint32_t* gpu_restore_idx, + int pull_value_size) = 0; + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) = 0; + + // dedup + virtual void CopyForPush(const paddle::platform::Place& place, + const uint64_t* total_keys, float** grad_values, + float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, + const int64_t dedup_length, const int batch_size, + const int* slot_dims, const int* key2slot, + const uint32_t* d_restore_idx, + const size_t grad_value_size) = 0; + + virtual void CopyForPush( + const paddle::platform::Place& place, const uint64_t* total_keys, + float** grad_values, float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, const int64_t dedup_length, + const int batch_size, const int* slot_dims, const int* key2slot, + const uint32_t* gpu_sort_idx, const uint32_t* gpu_sort_offset, + const uint32_t* gpu_sort_lens, const size_t 
grad_value_size) = 0; + + virtual std::string ParseToString(const float* v, int param_size) = 0; +}; + +template +class AccessorWrapper : public VirtualAccessor { + public: + explicit AccessorWrapper() {} + virtual ~AccessorWrapper() {} + AccessorWrapper(const AccessorWrapper&) = delete; + AccessorWrapper& operator=(const AccessorWrapper&) = delete; + + virtual int Configure(std::unordered_map config) { + return gpu_accessor_.Configure(config); + } + + virtual size_t GetFeatureValueSize(int& mf_dim) { + return gpu_accessor_.common_feature_value.Size(mf_dim); + } + + virtual size_t GetPushValueSize(int& mf_dim) { + return gpu_accessor_.common_push_value.Size(mf_dim); + } + + virtual size_t GetPullValueSize(int& mf_dim) { + return gpu_accessor_.common_pull_value.Size(mf_dim); + } + + virtual void BuildFill(void* gpu_val, void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { + gpu_accessor_.BuildFill((float*)(gpu_val), cpu_val, cpu_table_accessor, + mf_dim); + } + + virtual void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { + gpu_accessor_.DumpFill(gpu_val, cpu_table_accessor, mf_dim); + } + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, const int64_t total_length, + int* gpu_dim, int feature_value_size) { + CopyForPullImpl(place, gpu_keys, values, total_values_gpu, gpu_len, + slot_num, hidden_size, total_length, gpu_dim, + feature_value_size); + } + + virtual void CopyForPull(const paddle::platform::Place& place, + const uint64_t* total_keys, float** gpu_values, + const float* total_values_gpu, + const int64_t* slot_lens, const int* key2slot, + const int hidden_size, const int64_t total_length, + const int* slot_dims, + const uint32_t* gpu_restore_idx, + int pull_value_size) { + CopyForPullDedupImpl(place, total_keys, gpu_values, total_values_gpu, + slot_lens, key2slot, hidden_size, total_length, + slot_dims, gpu_restore_idx, pull_value_size); + } + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + CopyForPushImpl(place, grad_values, total_grad_values_gpu, slot_lengths, + total_length, batch_size, grad_value_size, slot_vector, + slot_mf_dim_vector); + } + + virtual void CopyForPush(const paddle::platform::Place& place, + const uint64_t* total_keys, float** grad_values, + float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, + const int64_t dedup_length, const int batch_size, + const int* slot_dims, const int* key2slot, + const uint32_t* d_restore_idx, + const size_t grad_value_size) { + CopyForPushDedupImpl(place, total_keys, grad_values, total_grad_values_gpu, + slots, slot_lens, hidden_size, total_length, + dedup_length, batch_size, slot_dims, key2slot, + d_restore_idx, grad_value_size); + } + + virtual void CopyForPush( + const paddle::platform::Place& place, const uint64_t* total_keys, + float** grad_values, float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, const int64_t dedup_length, + const int batch_size, const int* 
slot_dims, const int* key2slot, + const uint32_t* gpu_sort_idx, const uint32_t* gpu_sort_offset, + const uint32_t* gpu_sort_lens, const size_t grad_value_size) { + CopyForPushDedupImpl(place, total_keys, grad_values, total_grad_values_gpu, + slots, slot_lens, hidden_size, total_length, + dedup_length, batch_size, slot_dims, key2slot, + gpu_sort_idx, gpu_sort_offset, gpu_sort_lens, + grad_value_size); + } + + void CopyForPullImpl(const paddle::platform::Place& place, + uint64_t** gpu_keys, const std::vector& values, + const float* total_values_gpu, const int64_t* gpu_len, + const int slot_num, const int hidden_size, + const int64_t total_length, int* gpu_dim, + int feature_value_size); + + void CopyForPushImpl(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size, std::vector& slot_vector, + std::vector& slot_mf_dim_vector); + + void CopyForPullDedupImpl(const paddle::platform::Place& place, + const uint64_t* total_keys, float** gpu_values, + const float* total_values_gpu, + const int64_t* slot_lens, const int* key2slot, + const int hidden_size, const int64_t total_length, + const int* slot_dims, + const uint32_t* gpu_restore_idx, + int pull_value_size); + + void CopyForPushDedupImpl(const paddle::platform::Place& place, + const uint64_t* total_keys, float** grad_values, + float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, + const int64_t dedup_length, const int batch_size, + const int* slot_dims, const int* key2slot, + const uint32_t* d_restore_idx, + const size_t grad_value_size); + + void CopyForPushDedupImpl( + const paddle::platform::Place& place, const uint64_t* total_keys, + float** grad_values, float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, const int64_t dedup_length, + const int batch_size, const int* slot_dims, const int* key2slot, + const uint32_t* gpu_sort_idx, const uint32_t* gpu_sort_offset, + const uint32_t* gpu_sort_lens, const size_t grad_value_size); + + virtual std::string ParseToString(const float* v, int param_size) { + return gpu_accessor_.ParseToString(v, param_size); + } + + GPUAccessor gpu_accessor_; +}; + +class GlobalAccessorTransfor { + public: + static GlobalAccessorTransfor& GetInstance() { + static GlobalAccessorTransfor ins; + return ins; + } + void Init(std::string accessor_type) { + if (accessor_wrapper_ptr_ != nullptr) { + return; + } + if (accessor_type == "CtrDymfAccessor") { + accessor_wrapper_ptr_ = new AccessorWrapper(); + } else { + VLOG(0) << "GlobalAccessorTransfor Init not support accessor_type:" + << accessor_type; + accessor_wrapper_ptr_ = new AccessorWrapper(); + } + } + VirtualAccessor* GetAccessorWrapper() { return accessor_wrapper_ptr_; } + + private: + VirtualAccessor* accessor_wrapper_ptr_ = nullptr; +}; + } // end namespace framework } // end namespace paddle + #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 11a52d631729c..a0acad9563ef0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -27,7 +27,8 @@ DECLARE_double(gpugraph_hbm_table_load_factor); namespace paddle { namespace framework { enum GraphTableType { EDGE_TABLE, FEATURE_TABLE }; 
-class GpuPsGraphTable : public HeterComm { +class GpuPsGraphTable + : public HeterComm { public: int get_table_offset(int gpu_id, GraphTableType type, int idx) const { int type_id = type; @@ -36,7 +37,8 @@ class GpuPsGraphTable : public HeterComm { } GpuPsGraphTable(std::shared_ptr resource, int topo_aware, int graph_table_num) - : HeterComm(1, resource) { + : HeterComm( + 1, resource) { load_factor_ = FLAGS_gpugraph_hbm_table_load_factor; VLOG(0) << "load_factor = " << load_factor_; @@ -108,8 +110,7 @@ class GpuPsGraphTable : public HeterComm { } } } - ~GpuPsGraphTable() { - } + ~GpuPsGraphTable() {} void build_graph_on_single_gpu(const GpuPsCommGraph &g, int gpu_id, int idx); void build_graph_fea_on_single_gpu(const GpuPsCommGraphFea &g, int gpu_id); void clear_graph_info(int gpu_id, int index); @@ -118,8 +119,8 @@ class GpuPsGraphTable : public HeterComm { void clear_feature_info(int index); void build_graph_from_cpu(const std::vector &cpu_node_list, int idx); - void build_graph_fea_from_cpu(const std::vector &cpu_node_list, - int idx); + void build_graph_fea_from_cpu( + const std::vector &cpu_node_list, int idx); NodeQueryResult graph_node_sample(int gpu_id, int sample_size); NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, bool cpu_switch); @@ -129,17 +130,16 @@ class GpuPsGraphTable : public HeterComm { uint64_t *key, int sample_size, int len, bool cpu_query_switch); - int get_feature_of_nodes(int gpu_id, uint64_t* d_walk, - uint64_t* d_offset, int size, int slot_num); + int get_feature_of_nodes(int gpu_id, uint64_t *d_walk, uint64_t *d_offset, + int size, int slot_num); NodeQueryResult query_node_list(int gpu_id, int idx, int start, int query_size); void display_sample_res(void *key, void *val, int len, int sample_len); - void move_result_to_source_gpu(int gpu_id, int gpu_num, - int sample_size, int *h_left, - int *h_right, - uint64_t *src_sample_res, - int *actual_sample_size); + void move_result_to_source_gpu(int gpu_id, int gpu_num, int sample_size, + int *h_left, int *h_right, + uint64_t *src_sample_res, + int *actual_sample_size); int init_cpu_table(const paddle::distributed::GraphParameter &graph); int gpu_num; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index d63060cc5e391..2e4fd943b728e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -126,8 +126,9 @@ class HashTable { void get(const KeyType* d_keys, ValType* d_vals, size_t len, StreamType stream); - template - void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream); + template + void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream, + FVAccessor& fv_accessor); void show(); @@ -140,8 +141,8 @@ class HashTable { #if defined(PADDLE_WITH_CUDA) template - void update(const KeyType* d_keys, const float* d_grads, size_t len, - Sgd sgd, StreamType stream); + void update(const KeyType* d_keys, const float* d_grads, size_t len, Sgd sgd, + StreamType stream); template void update(const KeyType* d_keys, const char* d_grads, size_t len, Sgd sgd, @@ -168,14 +169,10 @@ class HashTable { << " push value size: " << push_grad_value_size_; } - void set_accessor(CommonFeatureValueAccessor& accessor) { - feature_value_accessor_ = accessor; - } void show_collision(int id) { return container_->print_collision(id); } std::unique_ptr rwlock_{nullptr}; - CommonFeatureValueAccessor feature_value_accessor_; private: #if defined(PADDLE_WITH_CUDA) 
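The hashtable.h and feature_value.h changes above remove the per-table CommonFeatureValueAccessor member and instead route all value-size and fill logic through the new VirtualAccessor / AccessorWrapper interface obtained from GlobalAccessorTransfor. The following is an illustrative host-side sketch (not part of this diff) of how that wrapper is created and queried; the function name AccessorUsageSketch and the config values are hypothetical, while the class names, config keys, and member functions come from the feature_value.h hunks above.

// Illustrative only: obtain the global accessor wrapper and query the
// per-feature byte sizes used to size HBM pools and pull/push buffers.
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"

void AccessorUsageSketch() {
  using paddle::framework::GlobalAccessorTransfor;
  using paddle::framework::VirtualAccessor;

  // Select the GPU-side accessor implementation once; later calls reuse it.
  GlobalAccessorTransfor::GetInstance().Init("CtrDymfAccessor");
  VirtualAccessor* accessor =
      GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();

  // Keys mirror the ones read in CommonFeatureValueAccessor::Initialize().
  std::unordered_map<std::string, float> config;
  config["embedx_dim"] = 8;      // hypothetical embedding dim
  config["optimizer_type"] = 1;  // default (non-adam) branch in Initialize()
  accessor->Configure(config);

  int mf_dim = 8;  // hypothetical slot mf dim
  size_t value_bytes = accessor->GetFeatureValueSize(mf_dim);  // HBM value size
  size_t pull_bytes = accessor->GetPullValueSize(mf_dim);      // pull buffer size
  size_t push_bytes = accessor->GetPushValueSize(mf_dim);      // push grad size
  (void)value_bytes;
  (void)pull_bytes;
  (void)push_bytes;
}

This is the same lookup HeterComm now performs in its constructor (see the heter_comm_inl.h hunk below), which is why the explicit set_accessor plumbing on HashTable could be dropped.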
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 3d083cf996553..2586d716ada48 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -80,12 +80,12 @@ __global__ void search_kernel(Table* table, } } -template +template __global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, char* vals, size_t len, size_t pull_feature_value_size, - CommonFeatureValueAccessor feature_value_accessor) { + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; // return; if (i < len) { @@ -94,28 +94,7 @@ __global__ void dy_mf_search_kernel(Table* table, uint64_t offset = i * pull_feature_value_size; float* cur = (float*)(vals + offset); float* input = it->second; - - cur[feature_value_accessor.common_pull_value.ShowIndex()] = - input[feature_value_accessor.common_feature_value.ShowIndex()]; - cur[feature_value_accessor.common_pull_value.ClickIndex()] = - input[feature_value_accessor.common_feature_value.ClickIndex()]; - cur[feature_value_accessor.common_pull_value.EmbedWIndex()] = - input[feature_value_accessor.common_feature_value.EmbedWIndex()]; - - int mf_size = int(input[feature_value_accessor.common_feature_value.MfSizeIndex()]); - if (mf_size == 0) { - cur[feature_value_accessor.common_pull_value.MfSizeIndex()] = 0; - return; - } - // set pull value real dim size - int mf_dim = int(input[feature_value_accessor.common_feature_value.MfDimIndex()]); - cur[feature_value_accessor.common_pull_value.MfSizeIndex()] = mf_dim; - - int embedx_off = feature_value_accessor.common_pull_value.EmbedxWIndex(); - int value_off = feature_value_accessor.common_feature_value.EmbedxWIndex(); - for (int k = 0; k < mf_dim; ++k) { - cur[embedx_off + k] = input[value_off + k]; - } + feature_value_accessor.PullValueFill(cur, input); } } } @@ -201,15 +180,16 @@ void HashTable::get(const KeyType* d_keys, ValType* d_vals, } template -template +template void HashTable::get(const KeyType* d_keys, char* d_vals, - size_t len, StreamType stream) { + size_t len, StreamType stream, + FVAccessor& fv_accessor) { if (len == 0) { return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; dy_mf_search_kernel<<>>( - container_, d_keys, d_vals, len, pull_feature_value_size_, feature_value_accessor_); + container_, d_keys, d_vals, len, pull_feature_value_size_, fv_accessor); } template @@ -337,14 +317,14 @@ template class HashTable; template class HashTable; template class HashTable; -template void HashTable::get< - cudaStream_t>(const unsigned long* d_keys, - float* d_vals, size_t len, - cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, float* d_vals, size_t len, + cudaStream_t stream); template void -HashTable::get( - const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream); +HashTable::get( + const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream, + CommonFeatureValueAccessor& fv_accessor); template void HashTable::get(const long* d_keys, int* d_vals, size_t len, @@ -353,7 +333,8 @@ template void HashTable::get(const long* d_keys, template void HashTable::get( const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream); template void HashTable::get( - const unsigned long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); + const unsigned long* d_keys, unsigned long* d_vals, size_t len, + cudaStream_t 
stream); template void HashTable::get( const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream); template void HashTable::get( @@ -368,15 +349,13 @@ template void HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t // stream); -template void HashTable::insert< - cudaStream_t>(const unsigned long* d_keys, - const float* d_vals, size_t len, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, const float* d_vals, size_t len, + cudaStream_t stream); -template void HashTable:: - insert(const unsigned long* d_keys, size_t len, char* pool, - size_t feature_value_size, size_t start_index, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, size_t len, char* pool, + size_t feature_value_size, size_t start_index, cudaStream_t stream); template void HashTable::insert(const long* d_keys, const int* d_vals, @@ -402,26 +381,27 @@ template void HashTable::insert( template void HashTable::insert( const long* d_keys, const unsigned int* d_vals, size_t len, cudaStream_t stream); - + template void HashTable::insert( const unsigned long* d_keys, const unsigned long* d_vals, size_t len, - cudaStream_t stream); + cudaStream_t stream); -template void HashTable:: - dump_to_cpu(int devid, cudaStream_t stream); +template void HashTable::dump_to_cpu( + int devid, cudaStream_t stream); template void -HashTable::update(const unsigned long* d_keys, const char* d_grads, size_t len, - SparseAdagradOptimizer sgd, - cudaStream_t stream); -template void -HashTable::update(const unsigned long* d_keys, const char* d_grads, size_t len, - SparseAdamOptimizer sgd, - cudaStream_t stream); +HashTable::update( + const unsigned long* d_keys, const char* d_grads, size_t len, + SparseAdagradOptimizer sgd, cudaStream_t stream); template void -HashTable::update(const unsigned long* d_keys, const char* d_grads, size_t len, - SparseAdamSharedOptimizer sgd, - cudaStream_t stream); +HashTable::update( + const unsigned long* d_keys, const char* d_grads, size_t len, + SparseAdamOptimizer sgd, cudaStream_t stream); +template void HashTable::update< + SparseAdamSharedOptimizer, cudaStream_t>(const unsigned long* d_keys, + const char* d_grads, size_t len, + SparseAdamSharedOptimizer sgd, + cudaStream_t stream); // template void HashTable::update< diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 72dfb476efe0a..0557b66d8655d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -45,23 +45,20 @@ namespace framework { #define TYPEALIGN(ALIGNVAL, LEN) \ (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) -template +template class HeterComm { public: HeterComm(size_t capacity, std::shared_ptr resource); - HeterComm(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor& accessor); virtual ~HeterComm(); HeterComm(const HeterComm&) = delete; HeterComm& operator=(const HeterComm&) = delete; void split_input_to_shard(KeyType* d_keys, int* d_idx_ptr, size_t len, - int* left, int* right, int gpu_num); + int* left, int* right, int gpu_num); void merge_keys(int gpu_num, const KeyType* d_keys, size_t len, - KeyType* d_sorted_keys, - KeyType* d_merged_keys, - uint32_t* d_restore_idx, - size_t & uniq_len); + KeyType* d_sorted_keys, KeyType* d_merged_keys, + uint32_t* d_restore_idx, size_t& uniq_len); void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, 
size_t len, int& uniq_len); // NOLINT void dynamic_merge_grad(int gpu_num, KeyType* d_keys, float* d_grads, @@ -126,7 +123,7 @@ class HeterComm { max_mf_dim_ = max_mf_dim; } - void set_accessor(CommonFeatureValueAccessor& accessor) { + void set_accessor(FVAccessor& accessor) { feature_value_accessor_ = accessor; } #endif @@ -142,16 +139,12 @@ class HeterComm { void end_pass(); #if defined(PADDLE_WITH_CUDA) // dedup - int dedup_keys_and_fillidx(const int gpu_id, - const int total_fea_num, - const KeyType* d_keys, // input - KeyType* d_merged_keys, // output - KeyType* d_sorted_keys, - uint32_t* d_restore_idx, - uint32_t* d_sorted_idx, - uint32_t* d_offset, - uint32_t* d_merged_cnts, - bool filter_zero); + int dedup_keys_and_fillidx(const int gpu_id, const int total_fea_num, + const KeyType* d_keys, // input + KeyType* d_merged_keys, // output + KeyType* d_sorted_keys, uint32_t* d_restore_idx, + uint32_t* d_sorted_idx, uint32_t* d_offset, + uint32_t* d_merged_cnts, bool filter_zero); #endif struct Node { @@ -259,11 +252,11 @@ class HeterComm { ValType* src_val); void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, size_t val_size); + protected: void pull_merge_sparse(int num, KeyType* d_keys, float* d_vals, size_t len); void pull_normal_sparse(int num, KeyType* d_keys, float* d_vals, size_t len); - protected: using Table = HashTable; using PtrTable = HashTable; std::vector tables_; @@ -274,7 +267,7 @@ class HeterComm { int block_size_{256}; std::unique_ptr heter_comm_kernel_; - CommonFeatureValueAccessor feature_value_accessor_; + FVAccessor feature_value_accessor_; private: int topo_aware_{0}; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index e97bcb561675b..3ce8a315af9fb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -31,8 +31,12 @@ DECLARE_int32(gpugraph_dedup_pull_push_mode); namespace paddle { namespace framework { -template -HeterComm::HeterComm( + +template +HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { VLOG(1) << "Construct new HeterComm"; resource_ = resource; @@ -45,52 +49,26 @@ HeterComm::HeterComm( platform::CUDADeviceGuard guard(resource_->dev_id(i)); allocators_.push_back(std::make_shared( 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT -#endif - if (!multi_mf_dim_) { - auto table = new Table(capacity / load_factor_); - tables_.push_back(table); - } else { - VLOG(0) << "Error:use HeterComm Construct with accessor"; - return; - } - if (multi_node_) { - storage_[i].init(feanum_, resource_->dev_id(i)); - } - } - heter_comm_kernel_ = std::make_unique(block_size_); - init_path(); -} - -template -HeterComm::HeterComm( - size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor& feature_value_accessor) { - VLOG(1) << "Construct new HeterComm"; - resource_ = resource; - storage_.resize(resource_->total_device()); - multi_mf_dim_ = resource->multi_mf(); - for (int i = 0; i < resource_->total_device(); ++i) { -#if defined(PADDLE_WITH_CUDA) - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - allocators_.push_back(std::make_shared( - 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT #endif if (!multi_mf_dim_) { auto table = new Table(capacity / load_factor_); tables_.push_back(table); } else { max_mf_dim_ = resource_->max_mf_dim(); - feature_value_accessor_ = feature_value_accessor; - size_t val_type_size = TYPEALIGN( - 8, 
feature_value_accessor_.common_feature_value.Size(max_mf_dim_)); - size_t grad_type_size = TYPEALIGN( - 8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); - size_t pull_type_size = feature_value_accessor_.common_pull_value.Size(max_mf_dim_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = + accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + size_t grad_type_size = + accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + size_t pull_type_size = + accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); + VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size << ", feature_value_push_size:" << grad_type_size << ", feature_pull_type_size:" << pull_type_size; auto ptr_table = new PtrTable(capacity / load_factor_); - ptr_table->set_accessor(feature_value_accessor_); + // ptr_table->set_accessor(feature_value_accessor_); ptr_table->set_feature_value_size(pull_type_size, grad_type_size); ptr_tables_.push_back(ptr_table); } @@ -98,13 +76,15 @@ HeterComm::HeterComm( storage_[i].init(feanum_, resource_->dev_id(i)); } } - heter_comm_kernel_ = - std::make_unique(block_size_, feature_value_accessor_); + heter_comm_kernel_ = std::make_unique(block_size_); init_path(); } -template -void HeterComm::init_path() { +template +void HeterComm::init_path() { int total_device = resource_->total_device(); path_.resize(total_device); if (!topo_aware_) { @@ -156,11 +136,18 @@ void HeterComm::init_path() { } } -template +template template -void HeterComm::memory_copy( - DstPlace dst_place, void* dst, SrcPlace src_place, const void* src, - size_t count, StreamType stream) { +void HeterComm::memory_copy( + DstPlace dst_place, + void* dst, + SrcPlace src_place, + const void* src, + size_t count, + StreamType stream) { #if defined(PADDLE_WITH_CUDA) CUDA_CHECK(cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream)); if (stream == 0) { @@ -171,11 +158,12 @@ void HeterComm::memory_copy( #endif } -template -void HeterComm::create_storage(int start_index, - int end_index, - size_t keylen, - size_t vallen) { +template +void HeterComm::create_storage( + int start_index, int end_index, size_t keylen, size_t vallen) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; @@ -184,11 +172,13 @@ void HeterComm::create_storage(int start_index, PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].key_storage), // NOLINT - keylen, resource_->remote_stream(nodes[i].dev_num, start_index))); + keylen, + resource_->remote_stream(nodes[i].dev_num, start_index))); PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].val_storage), // NOLINT - vallen, resource_->remote_stream(nodes[i].dev_num, start_index))); + vallen, + resource_->remote_stream(nodes[i].dev_num, start_index))); nodes[i].key_bytes_len = keylen; nodes[i].val_bytes_len = vallen; } @@ -207,9 +197,12 @@ void HeterComm::create_storage(int start_index, #endif } -template -void HeterComm::destroy_storage(int start_index, - int end_index) { +template +void HeterComm::destroy_storage( + int start_index, int end_index) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; @@ -224,12 +217,17 @@ void HeterComm::destroy_storage(int start_index, #endif } -template -void HeterComm::walk_to_dest(int start_index, - int num, int* 
h_left, - int* h_right, - KeyType* src_key, - GradType* src_val) { +template +void HeterComm::walk_to_dest( + int start_index, + int num, + int* h_left, + int* h_right, + KeyType* src_key, + GradType* src_val) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -249,18 +247,24 @@ void HeterComm::walk_to_dest(int start_index, auto src_place = DevPlace(src_dev_id); auto dst_place = DevPlace(dst_dev_id); - memory_copy(dst_place, node.key_storage, src_place, + memory_copy(dst_place, + node.key_storage, + src_place, reinterpret_cast(src_key + h_left[i]), - node.key_bytes_len, node.in_stream); + node.key_bytes_len, + node.in_stream); // #if defined(PADDLE_WITH_CUDA) // adapt for gpu-graph // cudaMemsetAsync(node.val_storage, -1, node.val_bytes_len, // node.in_stream); // #endif if (need_copy_val) { - memory_copy(dst_place, node.val_storage, src_place, + memory_copy(dst_place, + node.val_storage, + src_place, reinterpret_cast(src_val + h_left[i]), - node.val_bytes_len, node.in_stream); + node.val_bytes_len, + node.in_stream); } } while (!que.empty()) { @@ -282,13 +286,17 @@ void HeterComm::walk_to_dest(int start_index, auto src_place = DevPlace(src_dev_id); auto dst_place = DevPlace(dst_dev_id); - memory_copy(dst_place, cur_task.path->nodes_[cur_step + 1].key_storage, - src_place, cur_task.path->nodes_[cur_step].key_storage, + memory_copy(dst_place, + cur_task.path->nodes_[cur_step + 1].key_storage, + src_place, + cur_task.path->nodes_[cur_step].key_storage, cur_task.path->nodes_[cur_step + 1].key_bytes_len, cur_task.path->nodes_[cur_step + 1].in_stream); if (need_copy_val) { - memory_copy(dst_place, cur_task.path->nodes_[cur_step + 1].val_storage, - src_place, cur_task.path->nodes_[cur_step].val_storage, + memory_copy(dst_place, + cur_task.path->nodes_[cur_step + 1].val_storage, + src_place, + cur_task.path->nodes_[cur_step].val_storage, cur_task.path->nodes_[cur_step + 1].val_bytes_len, cur_task.path->nodes_[cur_step + 1].in_stream); } @@ -296,10 +304,18 @@ void HeterComm::walk_to_dest(int start_index, } } -template -void HeterComm::walk_to_dest( - int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, - char* src_val, size_t val_size) { +template +void HeterComm::walk_to_dest( + int start_index, + int gpu_num, + int* h_left, + int* h_right, + KeyType* src_key, + char* src_val, + size_t val_size) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -313,13 +329,18 @@ void HeterComm::walk_to_dest( auto& node = path_[start_index][i].nodes_[0]; CopyTask t(&path_[start_index][i], 0); que.push(t); - CUDA_CHECK(cudaMemcpyAsync( - node.key_storage, reinterpret_cast(src_key + h_left[i]), - node.key_bytes_len, cudaMemcpyDefault, node.in_stream)); + CUDA_CHECK(cudaMemcpyAsync(node.key_storage, + reinterpret_cast(src_key + h_left[i]), + node.key_bytes_len, + cudaMemcpyDefault, + node.in_stream)); if (need_copy_val) { - CUDA_CHECK(cudaMemcpyAsync( - node.val_storage, src_val + uint64_t(h_left[i]) * uint64_t(val_size), - node.val_bytes_len, cudaMemcpyDefault, node.in_stream)); + CUDA_CHECK( + cudaMemcpyAsync(node.val_storage, + src_val + uint64_t(h_left[i]) * uint64_t(val_size), + node.val_bytes_len, + cudaMemcpyDefault, + node.in_stream)); } } while (!que.empty()) { @@ -333,25 +354,34 @@ void HeterComm::walk_to_dest( int cur_step = cur_task.step; CopyTask c(cur_task.path, cur_step + 1); que.push(c); - CUDA_CHECK(cudaMemcpyAsync( - cur_task.path->nodes_[cur_step + 1].key_storage, - cur_task.path->nodes_[cur_step].key_storage, - cur_task.path->nodes_[cur_step + 
1].key_bytes_len, cudaMemcpyDefault, - cur_task.path->nodes_[cur_step + 1].in_stream)); + CUDA_CHECK( + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, + cur_task.path->nodes_[cur_step].key_storage, + cur_task.path->nodes_[cur_step + 1].key_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream)); if (need_copy_val) { - CUDA_CHECK(cudaMemcpyAsync( - cur_task.path->nodes_[cur_step + 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step + 1].val_bytes_len, - cudaMemcpyDefault, cur_task.path->nodes_[cur_step + 1].in_stream)); + CUDA_CHECK( + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step + 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream)); } } } } -template -void HeterComm::walk_to_src( - int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, +template +void HeterComm::walk_to_src( + int start_index, + int gpu_num, + int* h_left, + int* h_right, + char* src_val, size_t val_size) { std::queue que; for (int i = 0; i < gpu_num; i++) { @@ -362,8 +392,10 @@ void HeterComm::walk_to_src( auto& node = path_[start_index][i].nodes_[cur_step]; if (cur_step == 0) { CUDA_CHECK(cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, - node.val_storage, node.val_bytes_len, - cudaMemcpyDefault, node.out_stream)); + node.val_storage, + node.val_bytes_len, + cudaMemcpyDefault, + node.out_stream)); } else { CopyTask t(&path_[start_index][i], cur_step - 1); que.push(t); @@ -385,24 +417,29 @@ void HeterComm::walk_to_src( if (cur_step > 0) { CopyTask c(cur_task.path, cur_step - 1); que.push(c); - CUDA_CHECK(cudaMemcpyAsync( - cur_task.path->nodes_[cur_step - 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step - 1].val_bytes_len, cudaMemcpyDefault, - cur_task.path->nodes_[cur_step - 1].out_stream)); + CUDA_CHECK( + cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step - 1].out_stream)); } else if (cur_step == 0) { int end_index = cur_task.path->nodes_.back().dev_num; - CUDA_CHECK(cudaMemcpyAsync( - src_val + uint64_t(h_left[end_index]) * val_size, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step].val_bytes_len, cudaMemcpyDefault, - cur_task.path->nodes_[cur_step].out_stream)); + CUDA_CHECK( + cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step].out_stream)); } } } -template -HeterComm::~HeterComm() { +template +HeterComm::~HeterComm() { if (!multi_mf_dim_) { for (auto& table : tables_) { delete table; @@ -420,15 +457,23 @@ HeterComm::~HeterComm() { } } -template -void HeterComm::show_one_table(int gpu_num) { +template +void HeterComm::show_one_table( + int gpu_num) { if (!multi_mf_dim_) { tables_[gpu_num]->show(); } } -template -void HeterComm::show_table_collisions() { +template +void HeterComm:: + show_table_collisions() { size_t idx = 0; for (auto& table : tables_) { if (table != nullptr) { @@ -443,8 +488,11 @@ void HeterComm::show_table_collisions() { } } -template -int HeterComm::log2i(int x) { +template +int HeterComm::log2i(int x) { unsigned res = 0; while (x >>= 1) { ++res; 
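// --- Editor's aside (illustrative sketch, not part of this patch) ---
// log2i() above returns floor(log2(x)). split_input_to_shard() further down
// uses "num_bits = 1 + log2i(total_device)" as the radix-sort bit width, so
// the shard-index sort only inspects bits that can actually be set for an
// index in [0, total_device - 1]. The helper and check below are hypothetical
// names added only to show that this bit count is always sufficient.
#include <cassert>

static int log2i_sketch(int x) {
  unsigned res = 0;
  while (x >>= 1) {
    ++res;
  }
  return static_cast<int>(res);
}

int main() {
  for (int total_device = 1; total_device <= 64; ++total_device) {
    const int num_bits = 1 + log2i_sketch(total_device);
    // The largest possible shard index must fit in num_bits bits.
    assert(total_device - 1 < (1 << num_bits));
  }
  return 0;
}
// --- End editor's aside ---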
@@ -452,13 +500,20 @@ int HeterComm::log2i(int x) { return res; } -template -int HeterComm::get_index_by_devid(int devid) { +template +int HeterComm::get_index_by_devid( + int devid) { return resource_->get_index_by_devid(devid); } -template -void HeterComm::set_sparse_sgd( +template +void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -466,8 +521,11 @@ void HeterComm::set_sparse_sgd( } } -template -void HeterComm::set_embedx_sgd( +template +void HeterComm::set_embedx_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -475,10 +533,18 @@ void HeterComm::set_embedx_sgd( } } -template -void HeterComm::build_ps( - int dev_num, KeyType* h_keys, ValType* h_vals, size_t len, - size_t chunk_size, int stream_num, int offset) { +template +void HeterComm::build_ps( + int dev_num, + KeyType* h_keys, + ValType* h_vals, + size_t len, + size_t chunk_size, + int stream_num, + int offset) { if (len <= 0) { return; } @@ -513,17 +579,24 @@ void HeterComm::build_ps( auto dst_place = place; auto src_place = platform::CPUPlace(); - memory_copy( - dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), - src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); - memory_copy( - dst_place, reinterpret_cast(d_val_bufs[cur_stream]->ptr()), - src_place, h_vals + cur_len, sizeof(ValType) * tmp_len, cur_use_stream); + memory_copy(dst_place, + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + src_place, + h_keys + cur_len, + sizeof(KeyType) * tmp_len, + cur_use_stream); + memory_copy(dst_place, + reinterpret_cast(d_val_bufs[cur_stream]->ptr()), + src_place, + h_vals + cur_len, + sizeof(ValType) * tmp_len, + cur_use_stream); if (offset == -1) offset = dev_num; tables_[offset]->insert( reinterpret_cast(d_key_bufs[cur_stream]->ptr()), reinterpret_cast(d_val_bufs[cur_stream]->ptr()), - (size_t)tmp_len, cur_use_stream); + (size_t)tmp_len, + cur_use_stream); cur_stream += 1; cur_len += tmp_len; @@ -534,12 +607,18 @@ void HeterComm::build_ps( } } -template -void HeterComm::build_ps(int num, KeyType* h_keys, - char* pool, size_t len, - size_t feature_value_size, - size_t chunk_size, - int stream_num) { +template +void HeterComm::build_ps( + int num, + KeyType* h_keys, + char* pool, + size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { if (len <= 0) { return; } @@ -572,12 +651,19 @@ void HeterComm::build_ps(int num, KeyType* h_keys, auto dst_place = place; auto src_place = platform::CPUPlace(); - memory_copy( - dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), - src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); + memory_copy(dst_place, + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + src_place, + h_keys + cur_len, + sizeof(KeyType) * tmp_len, + cur_use_stream); ptr_tables_[num]->insert( - reinterpret_cast(d_key_bufs[cur_stream]->ptr()), tmp_len, - pool, feature_value_size, cur_len, cur_use_stream); + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + tmp_len, + pool, + feature_value_size, + cur_len, + cur_use_stream); cur_stream += 1; cur_len += tmp_len; } @@ -587,9 +673,15 @@ void HeterComm::build_ps(int num, KeyType* h_keys, } } -template -void HeterComm::merge_grad( - int dev_num, KeyType* d_keys, GradType* d_grads, size_t len, +template +void HeterComm::merge_grad( + int dev_num, + KeyType* d_keys, + GradType* d_grads, + 
size_t len, int& uniq_len) { // NOLINT int dev_id = resource_->dev_id(dev_num); DevPlace place = DevPlace(dev_id); @@ -601,37 +693,75 @@ void HeterComm::merge_grad( auto d_merge_grads = memory::Alloc(place, len * sizeof(GradType)); GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - heter_comm_kernel_->sort_pairs(NULL, temp_storage_bytes, d_keys, - d_merge_keys_ptr, d_grads, d_merge_grads_ptr, - len, 0, 8 * sizeof(KeyType), stream, false); + heter_comm_kernel_->sort_pairs(NULL, + temp_storage_bytes, + d_keys, + d_merge_keys_ptr, + d_grads, + d_merge_grads_ptr, + len, + 0, + 8 * sizeof(KeyType), + stream, + false); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - heter_comm_kernel_->sort_pairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, - d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false); + heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_keys, + d_merge_keys_ptr, + d_grads, + d_merge_grads_ptr, + len, + 0, + 8 * sizeof(KeyType), + stream, + false); temp_storage_bytes = 0; auto d_num_runs_out_mem = memory::Alloc(place, sizeof(int)); int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); - heter_comm_kernel_->reduce_by_key(NULL, temp_storage_bytes, d_merge_keys_ptr, - d_keys, d_merge_grads_ptr, d_grads, - d_num_runs_out, len, stream, false); + heter_comm_kernel_->reduce_by_key(NULL, + temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_merge_grads_ptr, + d_grads, + d_num_runs_out, + len, + stream, + false); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - heter_comm_kernel_->reduce_by_key( - d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, - d_merge_grads_ptr, d_grads, d_num_runs_out, len, stream, false); + heter_comm_kernel_->reduce_by_key(d_temp_storage->ptr(), + temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_merge_grads_ptr, + d_grads, + d_num_runs_out, + len, + stream, + false); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, &uniq_len, src_place, d_num_runs_out, sizeof(int), - stream); + memory_copy( + dst_place, &uniq_len, src_place, d_num_runs_out, sizeof(int), stream); sync_stream(stream); } -template -void HeterComm::dynamic_merge_grad( - int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len, - size_t& segment_len, bool enable_segment_merge_grad) { +template +void HeterComm::dynamic_merge_grad( + int gpu_num, + KeyType* d_keys, + float* d_grads, + size_t len, + int& uniq_len, + size_t& segment_len, + bool enable_segment_merge_grad) { int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); @@ -639,8 +769,9 @@ void HeterComm::dynamic_merge_grad( size_t temp_storage_bytes; size_t grad_dim = max_mf_dim_; - size_t grad_value_size = - TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); @@ -652,70 +783,129 @@ void HeterComm::dynamic_merge_grad( int* d_merged_size = (int*)&d_idx[len]; heter_comm_kernel_->fill_idx(d_idx, len, stream); - 
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_idx, d_index, len, - 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(NULL, + temp_storage_bytes, + d_keys, + d_merge_keys_ptr, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, - d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_keys, + d_merge_keys_ptr, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_fea_num_info_ptr, - d_merged_size, len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(NULL, + temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, - d_fea_num_info_ptr, d_merged_size, len, stream)); - - cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), - cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(d_temp_storage->ptr(), + temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); + + cudaMemcpyAsync((void*)&uniq_len, + d_merged_size, + sizeof(int), + cudaMemcpyDeviceToHost, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); assert(d_merged_size > 0); uint32_t* d_offset = (uint32_t*)&d_index[len]; temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, - uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); if (enable_segment_merge_grad) { - segment_merge_grad(gpu_num, d_merge_keys_ptr, d_grads, d_index, len, - d_fea_num_info_ptr, uniq_len, segment_len); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_keys, d_merge_keys_ptr, sizeof(KeyType) * segment_len, - cudaMemcpyDeviceToDevice, stream)); + segment_merge_grad(gpu_num, + d_merge_keys_ptr, + d_grads, + d_index, + len, + d_fea_num_info_ptr, + uniq_len, + segment_len); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys, + d_merge_keys_ptr, + sizeof(KeyType) * segment_len, + cudaMemcpyDeviceToDevice, + stream)); 
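// --- Editor's aside (illustrative sketch, not part of this patch) ---
// dynamic_merge_grad(), segment_merge_grad(), merge_keys() and
// dedup_keys_and_fillidx() all follow the standard two-phase CUB calling
// convention seen in the hunks above: call the primitive once with a NULL
// workspace so it only reports temp_storage_bytes, allocate that much device
// memory, then repeat the call with the real workspace. A minimal standalone
// version of that pattern (function and buffer names are made up for the
// example, not taken from this PR):
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdint>

void SortKeysByValueSketch(const uint64_t* d_keys_in, uint64_t* d_keys_out,
                           const uint32_t* d_idx_in, uint32_t* d_idx_out,
                           int num_items, cudaStream_t stream) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // Phase 1: d_temp_storage == nullptr, so CUB only computes the size.
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                  d_keys_in, d_keys_out, d_idx_in, d_idx_out,
                                  num_items, 0, 8 * sizeof(uint64_t), stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Phase 2: same arguments, now with a real workspace buffer.
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                  d_keys_in, d_keys_out, d_idx_in, d_idx_out,
                                  num_items, 0, 8 * sizeof(uint64_t), stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_temp_storage);
}
// --- End editor's aside ---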
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } else { auto d_merge_grads = memory::Alloc(place, len * grad_value_size); float* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - heter_comm_kernel_->merge_gradient( - d_keys, d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, - (char*)d_merge_grads_ptr, uniq_len, grad_dim, grad_value_size, merger_, - stream); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_grads, d_merge_grads_ptr, grad_value_size * uniq_len, - cudaMemcpyDeviceToDevice, stream)); + heter_comm_kernel_->merge_gradient(d_keys, + d_offset, + d_fea_num_info_ptr, + d_index, + (char*)d_grads, + (char*)d_merge_grads_ptr, + uniq_len, + grad_dim, + grad_value_size, + merger_, + stream, + feature_value_accessor_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, + d_merge_grads_ptr, + grad_value_size * uniq_len, + cudaMemcpyDeviceToDevice, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } } -template -void HeterComm::segment_merge_grad( +template +void HeterComm::segment_merge_grad( int gpu_num, // the device number KeyType* d_keys, // the sorted keys list, which will be modified after merged @@ -734,8 +924,9 @@ void HeterComm::segment_merge_grad( auto stream = resource_->local_stream(gpu_num, 0); auto grad_dim = max_mf_dim_; - auto grad_value_size = - TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto d_buffer1 = memory::Alloc(place, sizeof(uint32_t) * len); auto d_segments = reinterpret_cast(d_buffer1->ptr()); @@ -751,77 +942,127 @@ void HeterComm::segment_merge_grad( CUDA_CHECK(cudaMemsetAsync(d_segments_num, 0, sizeof(uint32_t), stream)); uint32_t segment_size = FLAGS_gpugraph_merge_grads_segment_size; - heter_comm_kernel_->split_segments(d_fea_num_info, uniq_len, d_segments, - d_segments_num, segment_size, stream); + heter_comm_kernel_->split_segments(d_fea_num_info, + uniq_len, + d_segments, + d_segments_num, + segment_size, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); size_t temp_storage_bytes = 0; PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum( NULL, temp_storage_bytes, d_segments, d_segments_num, uniq_len, stream)); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS( - cub::DeviceReduce::Sum(d_temp_storage->ptr(), temp_storage_bytes, - d_segments, d_segments_num, uniq_len, stream)); - CUDA_CHECK(cudaMemcpyAsync(&segments_num, d_segments_num, sizeof(uint32_t), - cudaMemcpyDeviceToHost, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum(d_temp_storage->ptr(), + temp_storage_bytes, + d_segments, + d_segments_num, + uniq_len, + stream)); + CUDA_CHECK(cudaMemcpyAsync(&segments_num, + d_segments_num, + sizeof(uint32_t), + cudaMemcpyDeviceToHost, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - cub::DeviceScan::ExclusiveSum(NULL, temp_storage_bytes, d_segments, - d_segments_offset, uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_segments, + d_segments_offset, + uniq_len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), 
temp_storage_bytes, d_segments, d_segments_offset, - uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_segments, + d_segments_offset, + uniq_len, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - heter_comm_kernel_->expand_segments(d_fea_num_info, d_segments_offset, - uniq_len, d_segments_fea_num_info, - segment_size, stream); + heter_comm_kernel_->expand_segments(d_fea_num_info, + d_segments_offset, + uniq_len, + d_segments_fea_num_info, + segment_size, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, d_segments_fea_num_info, - d_segments_fea_num_offset, segments_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_segments_fea_num_info, + d_segments_fea_num_offset, + segments_num, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_segments_fea_num_info, - d_segments_fea_num_offset, segments_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_segments_fea_num_info, + d_segments_fea_num_offset, + segments_num, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); auto d_segments_keys = memory::Alloc(place, sizeof(KeyType) * segments_num); auto d_segments_keys_ptr = reinterpret_cast(d_segments_keys->ptr()); - heter_comm_kernel_->shrink_keys(d_keys, d_segments_fea_num_offset, - d_segments_keys_ptr, segments_num, stream); + heter_comm_kernel_->shrink_keys(d_keys, + d_segments_fea_num_offset, + d_segments_keys_ptr, + segments_num, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); auto d_segment_grads = memory::Alloc(place, segments_num * grad_value_size); auto d_segment_grads_ptr = reinterpret_cast(d_segment_grads->ptr()); - heter_comm_kernel_->merge_gradient( - d_segments_keys_ptr, d_segments_fea_num_offset, d_segments_fea_num_info, - d_index, (char*)d_grads, (char*)d_segment_grads_ptr, segments_num, - grad_dim, grad_value_size, merger_, stream); + heter_comm_kernel_->merge_gradient(d_segments_keys_ptr, + d_segments_fea_num_offset, + d_segments_fea_num_info, + d_index, + (char*)d_grads, + (char*)d_segment_grads_ptr, + segments_num, + grad_dim, + grad_value_size, + merger_, + stream, + feature_value_accessor_); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys, d_segments_keys_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys, + d_segments_keys_ptr, sizeof(KeyType) * segments_num, - cudaMemcpyDeviceToDevice, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_segment_grads_ptr, + cudaMemcpyDeviceToDevice, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, + d_segment_grads_ptr, grad_value_size * segments_num, - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } -template -void HeterComm::split_input_to_shard( - KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, +template +void HeterComm::split_input_to_shard( + KeyType* d_keys, + int* d_idx_ptr, + size_t len, + int* left, + int* right, int dev_num) { int total_device = 
resource_->total_device(); int dev_id = resource_->dev_id(dev_num); @@ -839,42 +1080,64 @@ void HeterComm::split_input_to_shard( int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream); - heter_comm_kernel_->calc_shard_index(d_keys, len, d_shard_index_tmp_ptr, - total_device, stream); + heter_comm_kernel_->calc_shard_index( + d_keys, len, d_shard_index_tmp_ptr, total_device, stream); size_t temp_storage_bytes; const int num_bits = 1 + log2i(total_device); - heter_comm_kernel_->sort_pairs( - NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, - d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); + heter_comm_kernel_->sort_pairs(NULL, + temp_storage_bytes, + d_shard_index_tmp_ptr, + d_shard_index_ptr, + d_idx_tmp_ptr, + d_idx_ptr, + len, + 0, + num_bits, + stream); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - heter_comm_kernel_->sort_pairs( - d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, - d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); - - heter_comm_kernel_->calc_shard_offset(d_shard_index_ptr, left, right, len, - total_device, stream); + heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_shard_index_tmp_ptr, + d_shard_index_ptr, + d_idx_tmp_ptr, + d_idx_ptr, + len, + 0, + num_bits, + stream); + + heter_comm_kernel_->calc_shard_offset( + d_shard_index_ptr, left, right, len, total_device, stream); sync_stream(stream); } -template -void HeterComm::merge_keys( - int gpu_num, const KeyType* d_keys, size_t len, // input - KeyType* d_sorted_keys, // output - KeyType* d_merged_keys, // output - uint32_t* d_restore_idx, // output - size_t& uniq_len) { // output +template +void HeterComm::merge_keys( + int gpu_num, + const KeyType* d_keys, + size_t len, // input + KeyType* d_sorted_keys, // output + KeyType* d_merged_keys, // output + uint32_t* d_restore_idx, // output + size_t& uniq_len) { // output int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); auto stream = resource_->local_stream(gpu_num, 0); size_t grad_dim = max_mf_dim_; - size_t grad_value_size = TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 4 + 1)); - uint32_t* d_fea_num_info_ptr = reinterpret_cast(d_fea_num_info->ptr()); + uint32_t* d_fea_num_info_ptr = + reinterpret_cast(d_fea_num_info->ptr()); uint32_t* d_idx = (uint32_t*)&d_fea_num_info_ptr[len]; uint32_t* d_index = (uint32_t*)&d_idx[len]; uint32_t* d_offset = (uint32_t*)&d_index[len]; @@ -882,52 +1145,99 @@ void HeterComm::merge_keys( heter_comm_kernel_->fill_idx(d_idx, len, stream); size_t temp_storage_bytes; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_sorted_keys, d_idx, d_index, len, - 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(NULL, + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_sorted_keys, 
- d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_sorted_keys, d_merged_keys, d_fea_num_info_ptr, - d_merged_size, len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(NULL, + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - d_temp_storage->ptr(), temp_storage_bytes, d_sorted_keys, d_merged_keys, - d_fea_num_info_ptr, d_merged_size, len, stream)); - cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), - cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(d_temp_storage->ptr(), + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); + cudaMemcpyAsync((void*)&uniq_len, + d_merged_size, + sizeof(int), + cudaMemcpyDeviceToHost, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - heter_comm_kernel_->fill_restore_idx( - true, len, uniq_len, d_merged_keys, d_index, d_offset, - d_fea_num_info_ptr, d_restore_idx, stream); + heter_comm_kernel_->fill_restore_idx(true, + len, + uniq_len, + d_merged_keys, + d_index, + d_offset, + d_fea_num_info_ptr, + d_restore_idx, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } -template -void HeterComm::pull_merge_sparse( - int num, KeyType* d_keys, float* d_vals, size_t len) { +template +void HeterComm::pull_merge_sparse( + int num, KeyType* d_keys, float* d_vals, size_t len) { int total_device = resource_->total_device(); int dev_id = resource_->dev_id(num); DevPlace place = DevPlace(dev_id); @@ -952,18 +1262,24 @@ void HeterComm::pull_merge_sparse( auto xpu_context = xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, 
platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); #endif - size_t val_type_size = feature_value_accessor_.common_pull_value.Size(max_mf_dim_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size; auto d_sorted_keys = memory::Alloc(place, len * sizeof(KeyType)); auto d_sorted_keys_ptr = reinterpret_cast(d_sorted_keys->ptr()); @@ -977,28 +1293,38 @@ void HeterComm::pull_merge_sparse( auto d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); size_t uniq_len = 0; - merge_keys(num, d_keys, len, - d_sorted_keys_ptr, - d_merged_keys_ptr, - d_restore_idx_ptr, - uniq_len); + merge_keys(num, + d_keys, + len, + d_sorted_keys_ptr, + d_merged_keys_ptr, + d_restore_idx_ptr, + uniq_len); sync_stream(stream); auto d_idx = memory::Alloc(place, uniq_len * sizeof(int)); auto d_idx_ptr = reinterpret_cast(d_idx->ptr()); - split_input_to_shard(d_merged_keys_ptr, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, num); + split_input_to_shard( + d_merged_keys_ptr, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, num); heter_comm_kernel_->fill_shard_key( - d_shard_keys_ptr, d_merged_keys_ptr, d_idx_ptr, uniq_len, - stream); + d_shard_keys_ptr, d_merged_keys_ptr, d_idx_ptr, uniq_len, stream); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1006,8 +1332,8 @@ void HeterComm::pull_merge_sparse( if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(num, i, shard_len * sizeof(KeyType), - shard_len * val_type_size); + create_storage( + num, i, shard_len * sizeof(KeyType), shard_len * val_type_size); } walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); } @@ -1024,13 +1350,17 @@ void HeterComm::pull_merge_sparse( ptr_tables_[i]->rwlock_->RDLock(); if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->get(reinterpret_cast(node.key_storage), - node.val_storage, h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + node.val_storage, + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num), + feature_value_accessor_); } else { ptr_tables_[i]->get( d_shard_keys_ptr + h_left[i], reinterpret_cast(d_shard_vals_ptr) + h_left[i] * val_type_size, - h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num)); + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num), + feature_value_accessor_); } } @@ -1043,8 +1373,12 @@ void HeterComm::pull_merge_sparse( } if (!FLAGS_gpugraph_enable_gpu_direct_access) { - walk_to_src(num, total_device, h_left, h_right, - reinterpret_cast(d_shard_vals_ptr), val_type_size); + walk_to_src(num, + total_device, + h_left, + h_right, + reinterpret_cast(d_shard_vals_ptr), + val_type_size); for (int i = 0; i < total_device; ++i) { auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); @@ -1053,17 
+1387,21 @@ void HeterComm::pull_merge_sparse( auto d_merged_vals = memory::Alloc(place, uniq_len * val_type_size); auto d_merged_vals_ptr = reinterpret_cast(d_merged_vals->ptr()); - heter_comm_kernel_->dy_mf_fill_dvals( - d_shard_vals_ptr, d_merged_vals_ptr, - d_idx_ptr, uniq_len, - val_type_size, stream); + heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, + d_merged_vals_ptr, + d_idx_ptr, + uniq_len, + val_type_size, + stream); sync_stream(stream); - heter_comm_kernel_->unpack_merged_vals( - len, d_keys, - d_merged_vals_ptr, - d_restore_idx_ptr, - d_vals, val_type_size, stream); + heter_comm_kernel_->unpack_merged_vals(len, + d_keys, + d_merged_vals_ptr, + d_restore_idx_ptr, + d_vals, + val_type_size, + stream); sync_stream(stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { @@ -1075,9 +1413,12 @@ void HeterComm::pull_merge_sparse( } } } -template -void HeterComm::pull_normal_sparse( - int num, KeyType* d_keys, float* d_vals, size_t len) { +template +void HeterComm::pull_normal_sparse( + int num, KeyType* d_keys, float* d_vals, size_t len) { int total_device = resource_->total_device(); int dev_id = resource_->dev_id(num); DevPlace place = DevPlace(dev_id); @@ -1102,21 +1443,27 @@ void HeterComm::pull_normal_sparse( auto xpu_context = xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); #endif auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - size_t val_type_size = feature_value_accessor_.common_pull_value.Size(max_mf_dim_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size; auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); @@ -1125,18 +1472,26 @@ void HeterComm::pull_normal_sparse( split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); - heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, d_keys, d_idx_ptr, len, - stream); + heter_comm_kernel_->fill_shard_key( + d_shard_keys_ptr, d_keys, d_idx_ptr, len, stream); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1144,8 +1499,8 @@ void HeterComm::pull_normal_sparse( if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(num, i, shard_len * sizeof(KeyType), - shard_len * 
val_type_size); + create_storage( + num, i, shard_len * sizeof(KeyType), shard_len * val_type_size); } walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); } @@ -1161,13 +1516,17 @@ void HeterComm::pull_normal_sparse( ptr_tables_[i]->rwlock_->RDLock(); if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->get(reinterpret_cast(node.key_storage), - node.val_storage, h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + node.val_storage, + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num), + feature_value_accessor_); } else { ptr_tables_[i]->get( d_shard_keys_ptr + h_left[i], reinterpret_cast(d_shard_vals_ptr) + h_left[i] * val_type_size, - h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num)); + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num), + feature_value_accessor_); } } @@ -1179,17 +1538,20 @@ void HeterComm::pull_normal_sparse( ptr_tables_[i]->rwlock_->UNLock(); } if (!FLAGS_gpugraph_enable_gpu_direct_access) { - walk_to_src(num, total_device, h_left, h_right, - reinterpret_cast(d_shard_vals_ptr), val_type_size); + walk_to_src(num, + total_device, + h_left, + h_right, + reinterpret_cast(d_shard_vals_ptr), + val_type_size); for (int i = 0; i < total_device; ++i) { auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); } } - heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, - val_type_size, stream); - + heter_comm_kernel_->dy_mf_fill_dvals( + d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size, stream); sync_stream(stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1201,9 +1563,12 @@ void HeterComm::pull_normal_sparse( } } -template -void HeterComm::pull_sparse( - int num, KeyType* d_keys, float* d_vals, size_t len) { +template +void HeterComm::pull_sparse( + int num, KeyType* d_keys, float* d_vals, size_t len) { if (len == 0) { return; } @@ -1215,13 +1580,17 @@ void HeterComm::pull_sparse( } #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - float* d_grads, - size_t len, - Sgd& sgd) { // NOLINT +void HeterComm::push_sparse( + int dev_num, + KeyType* d_keys, + float* d_grads, + size_t len, + Sgd& sgd) { // NOLINT if (len == 0) { return; } @@ -1229,8 +1598,9 @@ void HeterComm::push_sparse(int dev_num, int total_device = resource_->total_device(); int dev_id = resource_->dev_id(dev_num); - size_t grad_value_size = - TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); @@ -1253,14 +1623,18 @@ void HeterComm::push_sparse(int dev_num, auto xpu_context = xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d 
%s]", + r2, XPUAPIErrorMsg[r2])); #endif @@ -1281,35 +1655,49 @@ void HeterComm::push_sparse(int dev_num, // do two gradient merge // 1st. do segmented gradient merge // 2nd. do global gradient merge - dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, - true); + dynamic_merge_grad( + dev_num, d_keys, d_grads, len, uniq_len, segment_len, true); len = segment_len; uniq_len = 0; segment_len = 0; - dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, - false); + dynamic_merge_grad( + dev_num, d_keys, d_grads, len, uniq_len, segment_len, false); } else { // Perform gradient merge only once - dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, - false); + dynamic_merge_grad( + dev_num, d_keys, d_grads, len, uniq_len, segment_len, false); } } - split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, - dev_num); + split_input_to_shard( + d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - heter_comm_kernel_->dy_mf_fill_shard_grads( - d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, uniq_len, - grad_value_size, stream); + heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr, + d_keys, + d_shard_grads_ptr, + d_grads, + d_idx_ptr, + uniq_len, + grad_value_size, + stream, + feature_value_accessor_); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1317,12 +1705,17 @@ void HeterComm::push_sparse(int dev_num, if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(dev_num, i, shard_len * sizeof(KeyType), - shard_len * grad_value_size); + create_storage( + dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size); } - walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, - reinterpret_cast(d_shard_grads_ptr), grad_value_size); + walk_to_dest(dev_num, + total_device, + h_left, + h_right, + d_shard_keys_ptr, + reinterpret_cast(d_shard_grads_ptr), + grad_value_size); } for (int i = 0; i < total_device; ++i) { @@ -1338,13 +1731,16 @@ void HeterComm::push_sparse(int dev_num, ptr_tables_[i]->rwlock_->WRLock(); if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->update(reinterpret_cast(node.key_storage), - node.val_storage, h_right[i] - h_left[i] + 1, sgd, + node.val_storage, + h_right[i] - h_left[i] + 1, + sgd, resource_->remote_stream(i, dev_num)); } else { ptr_tables_[i]->update(d_shard_keys_ptr + h_left[i], reinterpret_cast(d_shard_grads_ptr) + grad_value_size * h_left[i], - h_right[i] - h_left[i] + 1, sgd, + h_right[i] - h_left[i] + 1, + sgd, resource_->remote_stream(i, dev_num)); } } @@ -1371,11 +1767,12 @@ void HeterComm::push_sparse(int dev_num, } #elif defined(PADDLE_WITH_XPU_KP) -template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - GradType* d_grads, - size_t len) { +template +void HeterComm::push_sparse( + int dev_num, KeyType* d_keys, GradType* d_grads, size_t len) { if (len == 0) { return; } @@ -1405,14 +1802,18 @@ void HeterComm::push_sparse(int dev_num, auto xpu_context = 
xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); #endif @@ -1428,32 +1829,48 @@ void HeterComm::push_sparse(int dev_num, int uniq_len = len; merge_grad(dev_num, d_keys, d_grads, len, uniq_len); - split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, - dev_num); + split_input_to_shard( + d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, - d_shard_grads_ptr, d_grads, d_idx_ptr, - (long long)uniq_len, stream); + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, + d_keys, + d_shard_grads_ptr, + d_grads, + d_idx_ptr, + (long long)uniq_len, + stream); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); for (int i = 0; i < total_device; ++i) { int shard_len = h_right[i] - h_left[i] + 1; if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(dev_num, i, shard_len * sizeof(KeyType), - shard_len * sizeof(GradType)); + create_storage( + dev_num, i, shard_len * sizeof(KeyType), shard_len * sizeof(GradType)); } - walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, + walk_to_dest(dev_num, + total_device, + h_left, + h_right, + d_shard_keys_ptr, d_shard_grads_ptr); for (int i = 0; i < total_device; ++i) { @@ -1489,10 +1906,16 @@ void HeterComm::push_sparse(int dev_num, #endif #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::update_one_table( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, +void HeterComm::update_one_table( + int gpu_num, + KeyType* d_keys, + GradType* d_grads, + size_t len, Sgd& sgd) { // NOLINT if (len == 0) { return; @@ -1501,16 +1924,22 @@ void HeterComm::update_one_table( int dev_id = resource_->dev_id(gpu_num); platform::CUDADeviceGuard guard(dev_id); tables_[gpu_num]->rwlock_->WRLock(); - tables_[gpu_num]->update(d_keys, d_grads, len, sgd, - resource_->remote_stream(gpu_num, gpu_num)); + tables_[gpu_num]->update( + d_keys, d_grads, len, sgd, resource_->remote_stream(gpu_num, gpu_num)); tables_[gpu_num]->rwlock_->UNLock(); cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); } -template +template template -void HeterComm::push_sparse_multi_node( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, +void HeterComm::push_sparse_multi_node( + int gpu_num, + KeyType* d_keys, + GradType* d_grads, + size_t len, Sgd& sgd) { // NOLINT if (len == 0) { return; @@ -1521,15 +1950,23 @@ void HeterComm::push_sparse_multi_node( uniq_len = gather_one_node_grad(gpu_num, d_keys, d_grads, uniq_len); - 
uniq_len = gather_multi_node_grad(gpu_num, storage_[gpu_num].local_keys, - storage_[gpu_num].local_grads, uniq_len); + uniq_len = gather_multi_node_grad(gpu_num, + storage_[gpu_num].local_keys, + storage_[gpu_num].local_grads, + uniq_len); - update_one_table(gpu_num, storage_[gpu_num].local_keys, - storage_[gpu_num].local_grads, uniq_len, sgd); + update_one_table(gpu_num, + storage_[gpu_num].local_keys, + storage_[gpu_num].local_grads, + uniq_len, + sgd); } -template -int HeterComm::gather_one_node_grad( +template +int HeterComm::gather_one_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int total_gpu = resource_->total_device(); int dev_id = resource_->dev_id(gpu_num); @@ -1546,19 +1983,24 @@ int HeterComm::gather_one_node_grad( int* d_node_len = reinterpret_cast(d_node_len_mem->ptr()); h_node_len[gpu_num] = len; - cudaMemcpy(d_node_len + gpu_num, h_node_len + gpu_num, sizeof(int), + cudaMemcpy(d_node_len + gpu_num, + h_node_len + gpu_num, + sizeof(int), cudaMemcpyHostToDevice); // allgather grad len PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( - (const void*)(d_node_len + gpu_num), (void*)d_node_len, 1, // NOLINT - ncclInt, // NOLINT - nccl_inner_comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllGather((const void*)(d_node_len + gpu_num), + (void*)d_node_len, + 1, // NOLINT + ncclInt, // NOLINT + nccl_inner_comm, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - cudaMemcpy(h_node_len, d_node_len, sizeof(int) * total_gpu, - cudaMemcpyDeviceToHost); + cudaMemcpy( + h_node_len, d_node_len, sizeof(int) * total_gpu, cudaMemcpyDeviceToHost); for (int i = 0; i < total_gpu; ++i) { if (h_node_len[i] > max_size) { @@ -1572,9 +2014,13 @@ int HeterComm::gather_one_node_grad( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inner_comm, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( - d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, - nccl_inner_comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllGather(d_grads, + storage.all_grads, + max_size * sizeof(GradType), + ncclUint8, + nccl_inner_comm, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); @@ -1594,18 +2040,24 @@ int HeterComm::gather_one_node_grad( cudaMemset(d_left_ptr, -1, total_gpu * sizeof(int)); cudaMemset(d_right_ptr, -1, total_gpu * sizeof(int)); - split_input_to_shard(storage.all_keys + index, d_idx_ptr, h_node_len[i], - d_left_ptr, d_right_ptr, gpu_num); - cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - - heter_comm_kernel_->fill_shard_grads( - storage.local_keys + merge_num, storage.all_keys + index, - storage.local_grads + merge_num, storage.all_grads + index, - d_idx_ptr + h_left[gpu_num], h_right[gpu_num] - h_left[gpu_num] + 1, - stream); + split_input_to_shard(storage.all_keys + index, + d_idx_ptr, + h_node_len[i], + d_left_ptr, + d_right_ptr, + gpu_num); + cudaMemcpy( + h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy( + h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); + + heter_comm_kernel_->fill_shard_grads(storage.local_keys + 
merge_num, + storage.all_keys + index, + storage.local_grads + merge_num, + storage.all_grads + index, + d_idx_ptr + h_left[gpu_num], + h_right[gpu_num] - h_left[gpu_num] + 1, + stream); merge_num = merge_num + h_right[gpu_num] - h_left[gpu_num] + 1; } @@ -1614,8 +2066,11 @@ int HeterComm::gather_one_node_grad( return ret; } -template -int HeterComm::gather_multi_node_grad( +template +int HeterComm::gather_multi_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int dev_id = resource_->dev_id(gpu_num); auto& storage = storage_[gpu_num]; @@ -1638,8 +2093,8 @@ int HeterComm::gather_multi_node_grad( d_node_len, d_node_len, 1, ncclInt, nccl_inter_comm, stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - cudaMemcpy(h_node_len, d_node_len, sizeof(int) * node_size_, - cudaMemcpyDeviceToHost); + cudaMemcpy( + h_node_len, d_node_len, sizeof(int) * node_size_, cudaMemcpyDeviceToHost); for (int i = 0; i < node_size_; ++i) { if (h_node_len[i] > max_size) { @@ -1653,19 +2108,29 @@ int HeterComm::gather_multi_node_grad( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inter_comm, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( - d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, - nccl_inter_comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllGather(d_grads, + storage.all_grads, + max_size * sizeof(GradType), + ncclUint8, + nccl_inter_comm, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); int merge_num = 0; for (int i = 0; i < node_size_; ++i) { int index = i * max_size; - cudaMemcpyAsync(storage.local_keys + merge_num, storage.all_keys + index, - h_node_len[i], cudaMemcpyDefault, stream); - cudaMemcpyAsync(storage.local_grads + merge_num, storage.all_grads + index, - h_node_len[i], cudaMemcpyDefault, stream); + cudaMemcpyAsync(storage.local_keys + merge_num, + storage.all_keys + index, + h_node_len[i], + cudaMemcpyDefault, + stream); + cudaMemcpyAsync(storage.local_grads + merge_num, + storage.all_grads + index, + h_node_len[i], + cudaMemcpyDefault, + stream); merge_num += h_node_len[i]; } @@ -1675,8 +2140,11 @@ int HeterComm::gather_multi_node_grad( } #endif -template -void HeterComm::end_pass() { +template +void HeterComm::end_pass() { int total_device = resource_->total_device(); std::vector threads; @@ -1698,8 +2166,11 @@ void HeterComm::end_pass() { } #if defined(PADDLE_WITH_CUDA) -template -int HeterComm::dedup_keys_and_fillidx( +template +int HeterComm::dedup_keys_and_fillidx( const int gpu_id, const int total_fea_num, const KeyType* d_keys, // input @@ -1727,29 +2198,61 @@ int HeterComm::dedup_keys_and_fillidx( void* d_buf = NULL; size_t temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_sorted_keys, d_index_in, d_sorted_idx, - total_fea_num, 0, 8 * sizeof(KeyType), stream, false)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(NULL, + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_index_in, + d_sorted_idx, + total_fea_num, + 0, + 8 * sizeof(KeyType), + stream, + false)); auto d_cache_ptr = memory::Alloc(place, temp_storage_bytes); d_buf = reinterpret_cast(d_cache_ptr->ptr()); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_buf, temp_storage_bytes, d_keys, d_sorted_keys, 
d_index_in, - d_sorted_idx, total_fea_num, 0, 8 * sizeof(KeyType), stream, false)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(d_buf, + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_index_in, + d_sorted_idx, + total_fea_num, + 0, + 8 * sizeof(KeyType), + stream, + false)); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_sorted_keys, d_merged_keys, d_merged_cnts, - d_merged_size, total_fea_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(NULL, + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_merged_cnts, + d_merged_size, + total_fea_num, + stream)); if (d_cache_ptr->size() < temp_storage_bytes) { d_cache_ptr = NULL; d_cache_ptr = memory::Alloc(place, temp_storage_bytes); } d_buf = reinterpret_cast(d_cache_ptr->ptr()); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - d_buf, temp_storage_bytes, d_sorted_keys, d_merged_keys, d_merged_cnts, - d_merged_size, total_fea_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(d_buf, + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_merged_cnts, + d_merged_size, + total_fea_num, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync((void*)&merged_size, - (void*)d_merged_size, sizeof(int), - cudaMemcpyDeviceToHost, stream)); + (void*)d_merged_size, + sizeof(int), + cudaMemcpyDeviceToHost, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( @@ -1767,8 +2270,14 @@ int HeterComm::dedup_keys_and_fillidx( } // fill restore idx [1,3,5,2,4,6] = [1,2,1,3,2,1] heter_comm_kernel_->fill_restore_idx(filter_zero, - total_fea_num, merged_size, d_merged_keys, d_sorted_idx, - d_offset, d_merged_cnts, d_restore_idx, stream); + total_fea_num, + merged_size, + d_merged_keys, + d_sorted_idx, + d_offset, + d_merged_cnts, + d_restore_idx, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index 8d4f2625be70d..93ad75195882b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -117,12 +117,12 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, } } -template +template __global__ void merge_gradients_basic_kernel( const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_value_size, DynamicGradMerger& merger, - CommonFeatureValueAccessor& feature_value_accessor) { + FVAccessor& feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { @@ -143,12 +143,12 @@ __global__ void merge_gradients_basic_kernel( } } -template +template __global__ void merge_gradients_embedx_kernel( const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, - CommonFeatureValueAccessor& feature_value_accessor) { + FVAccessor& feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { @@ -363,11 +363,12 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage, debug_synchronous)); } -template +template void HeterCommKernel::dy_mf_fill_shard_grads( KeyType* d_shard_keys, KeyType* d_keys, float* 
d_shard_grads, float* d_grads, T* idx, long long len, size_t grad_value_size, - const StreamType& stream) { + const StreamType& stream, FVAccessor& feature_value_accessor) { int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; @@ -383,21 +384,21 @@ void HeterCommKernel::dy_mf_fill_shard_grads( d_shard_grads, d_grads, idx, N, grad_value_size_float); } -template +template void HeterCommKernel::merge_gradient( const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, - const StreamType& stream) { + const StreamType& stream, FVAccessor& feature_value_accessor) { int grid_size1 = (n - 1) / block_size_ + 1; merge_gradients_basic_kernel<<>>( d_keys, offset, fea_num, index, input, output, n, grad_value_size, merger, - feature_value_accessor_); + feature_value_accessor); if (grad_dim > 0) { int grid_size2 = (n * grad_dim - 1) / block_size_ + 1; merge_gradients_embedx_kernel<<>>( d_keys, offset, fea_num, index, input, output, n * grad_dim, grad_dim, - grad_value_size, merger, feature_value_accessor_); + grad_value_size, merger, feature_value_accessor); } } @@ -596,22 +597,23 @@ template void HeterCommKernel::reduce_by_key< int num_items, cudaStream_t stream, bool debug_synchronous); template void -HeterCommKernel::dy_mf_fill_shard_grads( +HeterCommKernel::dy_mf_fill_shard_grads< + unsigned long, int, cudaStream_t, CommonFeatureValueAccessor>( unsigned long* d_shard_keys, unsigned long* d_keys, float* d_shard_grads, float* d_grads, int* idx, long long len, size_t grad_value_size, - const cudaStream_t& stream); + const cudaStream_t& stream, CommonFeatureValueAccessor& feature_value_accessor); -template void HeterCommKernel::merge_gradient( +template void HeterCommKernel::merge_gradient( const uint32_t* d_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, - const cudaStream_t& stream); + const cudaStream_t& stream, CommonFeatureValueAccessor& feature_value_accessor); -template void HeterCommKernel::merge_gradient( +template void HeterCommKernel::merge_gradient( const uint64_t* d_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, - const cudaStream_t& stream); + const cudaStream_t& stream, CommonFeatureValueAccessor& feature_value_accessor); template void HeterCommKernel::dy_mf_fill_dvals( float* d_shard_vals, float* d_vals, int* idx, long long len, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 05d93b1d8bcc0..cb02773bc034f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -41,82 +41,48 @@ struct DynamicGradMerger { return out; } + template __device__ __forceinline__ void update_one( float* output, const float* input, - CommonFeatureValueAccessor& feature_value_accessor) { - output[feature_value_accessor.common_push_value.SlotIndex()] = - input[feature_value_accessor.common_push_value.SlotIndex()]; - output[feature_value_accessor.common_push_value.ShowIndex()] = - input[feature_value_accessor.common_push_value.ShowIndex()]; - output[feature_value_accessor.common_push_value.ClickIndex()] = - 
input[feature_value_accessor.common_push_value.ClickIndex()]; - output[feature_value_accessor.common_push_value.MfDimIndex()] = - input[feature_value_accessor.common_push_value.MfDimIndex()]; - output[feature_value_accessor.common_push_value.EmbedGIndex()] = - input[feature_value_accessor.common_push_value.EmbedGIndex()]; - for (int j = 0; - j < int(output[feature_value_accessor.common_push_value.MfDimIndex()]); - j++) { - output[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = - input[feature_value_accessor.common_push_value.EmbedxGIndex() + j]; - } + FVAccessor& feature_value_accessor) { + feature_value_accessor.PushValueFill(output, input); } + template __device__ __forceinline__ void merge_one( float* output, const float* input, - CommonFeatureValueAccessor& feature_value_accessor) { - output[feature_value_accessor.common_push_value.ShowIndex()] += - input[feature_value_accessor.common_push_value.ShowIndex()]; - output[feature_value_accessor.common_push_value.ClickIndex()] += - input[feature_value_accessor.common_push_value.ClickIndex()]; - output[feature_value_accessor.common_push_value.EmbedGIndex()] += - input[feature_value_accessor.common_push_value.EmbedGIndex()]; - for (int j = 0; - j < int(output[feature_value_accessor.common_push_value.MfDimIndex()]); - j++) { - output[feature_value_accessor.common_push_value.EmbedxGIndex() + j] += - input[feature_value_accessor.common_push_value.EmbedxGIndex() + j]; - } + FVAccessor& feature_value_accessor) { + feature_value_accessor.MergePushValue(output, input); } + template __device__ __forceinline__ void update_basic( float* output, const float* input, - CommonFeatureValueAccessor& fv_accessor) { - output[fv_accessor.common_push_value.SlotIndex()] = - input[fv_accessor.common_push_value.SlotIndex()]; - output[fv_accessor.common_push_value.ShowIndex()] = - input[fv_accessor.common_push_value.ShowIndex()]; - output[fv_accessor.common_push_value.ClickIndex()] = - input[fv_accessor.common_push_value.ClickIndex()]; - output[fv_accessor.common_push_value.MfDimIndex()] = - input[fv_accessor.common_push_value.MfDimIndex()]; - output[fv_accessor.common_push_value.EmbedGIndex()] = - input[fv_accessor.common_push_value.EmbedGIndex()]; + FVAccessor& fv_accessor) { + fv_accessor.PushValueFillBasic(output, input); } + template __device__ __forceinline__ void merge_basic( float* output, const float* input, - CommonFeatureValueAccessor& fv_accessor) { - output[fv_accessor.common_push_value.ShowIndex()] += - input[fv_accessor.common_push_value.ShowIndex()]; - output[fv_accessor.common_push_value.ClickIndex()] += - input[fv_accessor.common_push_value.ClickIndex()]; - output[fv_accessor.common_push_value.EmbedGIndex()] += - input[fv_accessor.common_push_value.EmbedGIndex()]; + FVAccessor& fv_accessor) { + fv_accessor.MergePushValueBasic(output, input); } + template __device__ __forceinline__ void update_embedx( float* output, const float* input, size_t embedx_idx, - CommonFeatureValueAccessor& fv_accessor) { + FVAccessor& fv_accessor) { if (embedx_idx < output[fv_accessor.common_push_value.MfDimIndex()]) { output[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx] = input[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx]; } } + template __device__ __forceinline__ void merge_embedx( float* output, const float* input, size_t embedx_idx, - CommonFeatureValueAccessor& fv_accessor) { + FVAccessor& fv_accessor) { if (embedx_idx < output[fv_accessor.common_push_value.MfDimIndex()]) { output[fv_accessor.common_push_value.EmbedxGIndex() 
+ embedx_idx] += input[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx]; @@ -129,11 +95,6 @@ class HeterCommKernel { HeterCommKernel() {} explicit HeterCommKernel(const int block_size) : block_size_(block_size) {} - explicit HeterCommKernel(const int block_size, - CommonFeatureValueAccessor& feature_value_accessor) - : block_size_(block_size), - feature_value_accessor_(feature_value_accessor) {} - template void fill_idx(T* idx, long long len, const StreamType& stream); @@ -182,18 +143,21 @@ class HeterCommKernel { StreamType stream = NULL, bool debug_synchronous = false); - template + template void dy_mf_fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, float* d_shard_grads, float* d_grads, T* idx, long long len, size_t grad_value_size, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); - template + template void merge_gradient(const KeyType* d_shard_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); template void dy_mf_fill_dvals(float* d_shard_vals, float* d_vals, T* idx, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc index 1ec006f580c96..86e339ca74a30 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" #include +#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" #ifdef PADDLE_WITH_HETERPS @@ -21,25 +21,39 @@ namespace paddle { namespace framework { HeterPsBase* HeterPsBase::get_instance( - size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, + size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type) { - return new HeterPs(capacity, resource, feature_value_accessor, optimizer_type); + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } else { + VLOG(0) << " HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type; + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type) { - comm_ = - std::make_shared>( - capacity, resource); - feature_value_accessor_ = feature_value_accessor; +HeterPs::HeterPs(size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); optimizer_type_ = optimizer_type; } HeterPs::~HeterPs() {} -void HeterPs::pull_sparse(int num, FeatureKey* d_keys, float* d_vals, +void HeterPs::pull_sparse(int num, + FeatureKey* d_keys, + float* d_vals, size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } @@ -60,8 +74,10 @@ void HeterPs::end_pass() { comm_->end_pass(); } void 
HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } -void HeterPs::push_sparse(int num, FeatureKey* d_keys, - float* d_grads, size_t len) { +void HeterPs::push_sparse(int num, + FeatureKey* d_keys, + float* d_grads, + size_t len) { comm_->push_sparse(num, d_keys, d_grads, len); // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index b812250f012a0..32f037b248944 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -23,87 +23,132 @@ namespace framework { HeterPsBase* HeterPsBase::get_instance( size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, int optimizer_type) { - return new HeterPs(capacity, resource, feature_value_accessor, - optimizer_type); + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type) { + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } else { + VLOG(0) << " HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type; + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type) { - comm_ = std::make_shared>( - capacity, resource, feature_value_accessor); - feature_value_accessor_ = feature_value_accessor; +template +HeterPs::HeterPs( + size_t capacity, std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); + feature_value_accessor_.Configure(fleet_config); + set_accessor(feature_value_accessor_); + accessor_type_ = accessor_type; optimizer_type_ = optimizer_type; } -HeterPs::~HeterPs() {} +template +HeterPs::~HeterPs() {} -void HeterPs::pull_sparse(int num, FeatureKey* d_keys, float* d_vals, - size_t len) { +template +void HeterPs::pull_sparse(int num, FeatureKey* d_keys, + float* d_vals, size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } -void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, - size_t feature_value_size, size_t chunk_size, - int stream_num) { +template +void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, + size_t len, size_t feature_value_size, + size_t chunk_size, int stream_num) { comm_->build_ps(num, h_keys, pool, len, feature_value_size, chunk_size, stream_num); } -int HeterPs::get_index_by_devid(int devid) { +template +int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } -void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_sparse_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_sparse_sgd(optimizer_config); } -void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_embedx_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_embedx_sgd(optimizer_config); } -void HeterPs::end_pass() { comm_->end_pass(); } +template +void HeterPs::end_pass() { + comm_->end_pass(); +} -void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } +template +void HeterPs::show_one_table(int gpu_num) { + 
comm_->show_one_table(gpu_num); +} -void HeterPs::push_sparse(int num, FeatureKey* d_keys, float* d_grads, +template +void HeterPs::push_sparse(int num, FeatureKey* d_keys, float* d_grads, size_t len) { - if (optimizer_type_ == 3) { // adam - auto optimizer = SparseAdamOptimizer(feature_value_accessor_); - VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():" - << optimizer.EmbedDim(); - comm_->push_sparse(num, d_keys, d_grads, len, optimizer); - } else if (optimizer_type_ == 4) { // shared_adam - auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_); - VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():" - << optimizer.EmbedDim(); - comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + if (accessor_type_ == "CtrDymfAccessor") { + if (optimizer_type_ == 3) { // adam + auto optimizer = SparseAdamOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 4) { // shared_adam + auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 1) { // adagrad + auto optimizer = SparseAdagradOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else { + VLOG(0) << " push sparse Error: CtrDymfAccessor only support adagrad(1)," + "adam(3) or shared_adam(4), but get optimizer type:" + << optimizer_type_; + } } else { - auto optimizer = SparseAdagradOptimizer(feature_value_accessor_); - VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():" - << optimizer.EmbedDim(); - comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + VLOG(0) << " push sparse Error: now only support CtrDymfAccessor, but get " + << accessor_type_; } } -void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, - int comm_size) { +template +void HeterPs::set_nccl_comm_and_size( + const std::vector& inner_comms, + const std::vector& inter_comms, int comm_size) { comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); } -void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { +template +void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); } -void HeterPs::set_accessor(CommonFeatureValueAccessor& accessor) { +template +void HeterPs::set_accessor(FVAccessor& accessor) { comm_->set_accessor(accessor); } -void HeterPs::show_table_collisions() { comm_->show_table_collisions(); } +template +void HeterPs::show_table_collisions() { + comm_->show_table_collisions(); +} -int HeterPs::dedup_keys_and_fillidx(const int gpu_id, +template +int HeterPs::dedup_keys_and_fillidx(const int gpu_id, const int total_fea_num, const FeatureKey* d_keys, // input FeatureKey* d_merged_keys, // output diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index caa069fb30613..a8314e2e28e22 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -26,12 +26,13 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +template class HeterPs : public HeterPsBase { public: HeterPs() {} HeterPs(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type); + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type); virtual ~HeterPs(); HeterPs(const HeterPs&) = delete; HeterPs& operator=(const HeterPs&) = delete; @@ -48,7 +49,8 @@ class HeterPs : public HeterPsBase { const std::vector& inter_comms, int comm_size) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; - void set_accessor(CommonFeatureValueAccessor& accessor) override; + + void set_accessor(FVAccessor& accessor); #endif void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; @@ -73,9 +75,10 @@ class HeterPs : public HeterPsBase { bool filter_zero); #endif private: - std::shared_ptr> comm_; + std::shared_ptr> comm_; #if defined(PADDLE_WITH_CUDA) - CommonFeatureValueAccessor feature_value_accessor_; + FVAccessor feature_value_accessor_; + std::string accessor_type_; int optimizer_type_; #endif }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index e5fe095f9b011..840ad8f001b74 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -43,7 +43,6 @@ class HeterPsBase { const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) = 0; virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; - virtual void set_accessor(CommonFeatureValueAccessor& accessor) = 0; #endif virtual void end_pass() = 0; @@ -57,7 +56,9 @@ class HeterPsBase { static HeterPsBase* get_instance( size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, int optimizer_type); + // CommonFeatureValueAccessor feature_value_accessor, + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type); #if defined(PADDLE_WITH_CUDA) // dedup virtual int dedup_keys_and_fillidx(const int gpu_id, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 5fc21d2b0e495..4d64375f93656 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -28,12 +28,12 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include #include #include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/platform/timer.h" #if defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" @@ -45,10 +45,12 @@ namespace paddle { namespace framework { #ifdef PADDLE_WITH_PSLIB -void AfsWrapper::init(const std::string& fs_name, const std::string& fs_user, - const std::string& pass_wd, const std::string& conf) { - int ret = afs_handler_.init(fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), - conf.c_str()); +void AfsWrapper::init(const std::string& fs_name, + const std::string& fs_user, + const std::string& pass_wd, + const std::string& conf) { + int ret = afs_handler_.init( + fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), conf.c_str()); if (ret != 0) { LOG(ERROR) << "AFS Init Error"; } @@ -100,8 +102,8 @@ void PSGPUWrapper::InitAfsApi(const std::string& fs_name, const std::string& fs_user, const std::string& pass_wd, const std::string& conf) { - int ret = afs_handler_.init(fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), - conf.c_str()); + int ret = afs_handler_.init( + fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), conf.c_str()); if (ret != 0) { VLOG(0) << "AFS Init Error"; } @@ -148,16 +150,20 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(0) << "total len: " << total_len; auto gen_dynamic_mf_func = [this]( const std::deque& total_data, - int begin_index, int end_index, int i) { + int begin_index, + int end_index, + int i) { for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { + iter != total_data.begin() + end_index; + iter++) { const auto& ins = *iter; const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; const auto& slot_offset = ins->slot_uint64_feasigns_.slot_offsets; for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); slot_idx++) { for (size_t j = slot_offset[slot_offset_vector_[slot_idx]]; - j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { + j < slot_offset[slot_offset_vector_[slot_idx] + 1]; + j++) { int shard_id = feasign_v[j] % thread_keys_shard_num_; int dim_id = slot_index_vec_[slot_idx]; if (feasign_v[j] != 0) { @@ -170,8 +176,11 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { }; for (int i = 0; i < thread_keys_thread_num_; i++) { threads.push_back( - std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); + std::thread(gen_dynamic_mf_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); begin += len_per_thread + (i < remain ? 
1 : 0); } @@ -192,9 +201,12 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { len_per_thread = total_len / thread_keys_thread_num_; remain = total_len % thread_keys_thread_num_; auto gen_func = [this](const std::deque& total_data, - int begin_index, int end_index, int i) { + int begin_index, + int end_index, + int i) { for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { + iter != total_data.begin() + end_index; + iter++) { const auto& ins = *iter; const auto& feasign_v = ins.uint64_feasigns_; for (const auto feasign : feasign_v) { @@ -206,8 +218,11 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { }; for (int i = 0; i < thread_keys_thread_num_; i++) { threads.push_back( - std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); + std::thread(gen_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); begin += len_per_thread + (i < remain ? 1 : 0); } for (std::thread& t : threads) { @@ -227,19 +242,25 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(0) << "GpuGraphTotalKeys: " << total_len; remain = total_len % thread_keys_thread_num_; auto gen_graph_data_func = [this](const std::vector& total_data, - int begin_index, int end_index, int i) { + int begin_index, + int end_index, + int i) { for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { + iter != total_data.begin() + end_index; + iter++) { uint64_t cur_key = *iter; int shard_id = cur_key % thread_keys_shard_num_; this->thread_keys_[i][shard_id].insert(cur_key); } }; auto gen_graph_dynamic_mf_func = - [this](const std::vector& total_data, int begin_index, - int end_index, int i) { + [this](const std::vector& total_data, + int begin_index, + int end_index, + int i) { for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { + iter != total_data.begin() + end_index; + iter++) { uint64_t cur_key = *iter; int shard_id = cur_key % thread_keys_shard_num_; // TODO: feasign <-> slot <-> multi_dim @@ -250,13 +271,19 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { if (!multi_mf_dim_) { VLOG(1) << "psgpu graph wrapper genfunc"; threads.push_back( - std::thread(gen_graph_data_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); + std::thread(gen_graph_data_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); } else { VLOG(1) << "psgpu graph wrapper genfunc with dynamic mf"; threads.push_back( - std::thread(gen_graph_dynamic_mf_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); + std::thread(gen_graph_dynamic_mf_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); } begin += len_per_thread + (i < remain ? 
1 : 0); } @@ -271,8 +298,8 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { // merge thread_keys to shard_keys auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys(shard_num, dim_id, - thread_dim_keys_[i][shard_num][dim_id]); + gpu_task->batch_add_keys( + shard_num, dim_id, thread_dim_keys_[i][shard_num][dim_id]); thread_dim_keys_[i][shard_num][dim_id].clear(); } }; @@ -349,82 +376,87 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { timeline.Start(); - auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, - &fleet_ptr](int i, int j) { - size_t key_size = local_dim_keys[i][j].size(); - int32_t status = -1; - int32_t cnt = 0; + auto ptl_dynamic_mf_func = + [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr](int i, int j) { + size_t key_size = local_dim_keys[i][j].size(); + int32_t status = -1; + int32_t cnt = 0; #ifdef PADDLE_WITH_PSLIB - while (true) { - auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - i, reinterpret_cast(local_dim_ptr[i][j].data()), - this->table_id_, local_dim_keys[i][j].data(), key_size); - bool flag = true; - - tt.wait(); - - try { - status = tt.get(); - } catch (const std::future_error& e) { - VLOG(0) << "Caught a future_error with code" << e.code() - << ", Message:" << e.what(); - } - if (status != 0) { - VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; - sleep(sleep_seconds_before_fail_exit_); - flag = false; - cnt++; - } - if (cnt > 3) { - VLOG(0) << "fleet pull sparse failed, retry 3 times"; - exit(-1); - } + while (true) { + auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( + i, + reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, + local_dim_keys[i][j].data(), + key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } - if (flag) { - break; - } - } + if (flag) { + break; + } + } #endif #ifdef PADDLE_WITH_PSCORE - while (true) { - auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( - reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, - local_dim_keys[i][j].data(), key_size); - bool flag = true; - - tt.wait(); - - try { - status = tt.get(); - } catch (const std::future_error& e) { - VLOG(0) << "Caught a future_error with code" << e.code() - << ", Message:" << e.what(); - } - if (status != 0) { - VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; - sleep(sleep_seconds_before_fail_exit_); - flag = false; - cnt++; - } - if (cnt > 3) { - VLOG(0) << "fleet pull sparse failed, retry 3 times"; - exit(-1); - } + while (true) { + auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( + reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, + local_dim_keys[i][j].data(), + key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + 
VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } - if (flag) { - break; - } - } + if (flag) { + break; + } + } #endif - if (status != 0) { - LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; - sleep(300); - exit(-1); - } else { - VLOG(0) << "FleetWrapper Pull sparse to local done with table size: " - << local_dim_keys[i][j].size(); - } - }; + if (status != 0) { + LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; + sleep(300); + exit(-1); + } else { + VLOG(0) << "FleetWrapper Pull sparse to local done with table size: " + << local_dim_keys[i][j].size(); + } + }; threads.resize(thread_keys_shard_num_ * multi_mf_dim_); for (int i = 0; i < thread_keys_shard_num_; i++) { @@ -455,8 +487,11 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { bool record_status = false; auto& device_task_keys = gpu_task->device_task_keys_; auto& device_task_ptrs = gpu_task->device_task_ptr_; - auto build_pull_dynamic_mf_func = [this, device_num, &local_dim_keys, - &local_dim_ptr, &device_dim_keys, + auto build_pull_dynamic_mf_func = [this, + device_num, + &local_dim_keys, + &local_dim_ptr, + &device_dim_keys, &device_dim_ptr, &device_dim_mutex](int i, int j) { std::vector> task_keys(device_num); @@ -488,8 +523,13 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { device_dim_mutex[dev][j]->unlock(); } }; - auto build_func = [device_num, record_status, &pass_values, &local_keys, - &local_ptr, &device_task_keys, &device_task_ptrs](int i) { + auto build_func = [device_num, + record_status, + &pass_values, + &local_keys, + &local_ptr, + &device_task_keys, + &device_task_ptrs](int i) { auto& task_keys = device_task_keys[i]; #ifdef PADDLE_WITH_PSLIB auto& task_ptrs = device_task_ptrs[i]; @@ -584,163 +624,123 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { return; } std::vector threads(device_num); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); HeterPs_ = HeterPsBase::get_instance( - size_max, resource_, feature_value_accessor_, optimizer_type_); + size_max, resource_, fleet_config_, accessor_class_, optimizer_type_); #ifdef PADDLE_WITH_CUDA HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); HeterPs_->set_sparse_sgd(optimizer_config_); HeterPs_->set_embedx_sgd(optimizer_config_); #endif - auto build_dynamic_mf_func = - [this, &gpu_task](int i, int j) { - this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - // this->HeterPs_->set_accessor(feature_value_accessor_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim - << " feature_value_dim:" - << feature_value_accessor_.common_feature_value.Dim(mf_dim) - << " feature_value_size:" - << feature_value_accessor_.common_feature_value.Size(mf_dim); - size_t feature_value_size = TYPEALIGN( - 8, feature_value_accessor_.common_feature_value.Size(mf_dim)); - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - this->mem_pools_[i * this->multi_mf_dim_ + j] = - new MemoryPool(len, feature_value_size); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - for (size_t k = 0; k < len; k++) { - float* val = (float*)(mem_pool->mem_address(k)); - float* ptr_val = device_dim_ptrs[k]->data(); - size_t dim = device_dim_ptrs[k]->size(); + auto build_dynamic_mf_func = [this, &gpu_task, &accessor_wrapper_ptr](int i, + int 
j) { + this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); + // this->HeterPs_->set_accessor(feature_value_accessor_); + int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim + << " feature_value_size:" + << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; + size_t len = device_dim_keys.size(); + CHECK(len == device_dim_ptrs.size()); + this->mem_pools_[i * this->multi_mf_dim_ + j] = + new MemoryPool(len, feature_value_size); + auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; + #ifdef PADDLE_WITH_PSLIB - val->delta_score = - ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::delta_score_index()]; - val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::show_index()]; - val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::click_index()]; - val->slot = - int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::slot_index()]); - val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_w_index()]; - val->lr_g2sum = - ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_g2sum_index()]; - // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + for (size_t k = 0; k < len; k++) { + float* val = (float*)(mem_pool->mem_address(k)); + float* ptr_val = device_dim_ptrs[k]->data(); + size_t dim = device_dim_ptrs[k]->size(); + val->delta_score = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::mf_dim_index()] = - float(mf_dim); - val->mf_dim = mf_dim; - if (dim > 8) { // CpuPS alreay expand as mf_dim - val->mf_size = mf_dim + 1; - for (int x = 0; x < val->mf_dim + 1; x++) { - val->mf[x] = ptr_val[x + 8]; - } - } else { - val->mf_size = 0; - for (int x = 0; x < val->mf_dim + 1; x++) { - val->mf[x] = 0; - } - } - } -#endif -#ifdef PADDLE_WITH_PSCORE - VLOG(5) << "cpu build " << k - << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) - << " |: " << cpu_table_accessor_->ParseToString(ptr_val, dim); - val[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value - .DeltaScoreIndex()]; - val[feature_value_accessor_.common_feature_value.ShowIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.ShowIndex()]; - val[feature_value_accessor_.common_feature_value.ClickIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.ClickIndex()]; - val[feature_value_accessor_.common_feature_value.SlotIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.SlotIndex()]; - val[feature_value_accessor_.common_feature_value.EmbedWIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.EmbedWIndex()]; - for (int i = 0; - i < feature_value_accessor_.common_feature_value.EmbedDim(); i++) { - val[feature_value_accessor_.common_feature_value.EmbedG2SumIndex() + - i] = ptr_val - [cpu_table_accessor_->common_feature_value.EmbedG2SumIndex() + i]; + DownpourCtrDymfFeatureValue::delta_score_index()]; + val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::show_index()]; + val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::click_index()]; + val->slot = 
int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()]); + val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_w_index()]; + val->lr_g2sum = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_g2sum_index()]; + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + mf_dim_index()] = float(mf_dim); + val->mf_dim = mf_dim; + if (dim > 8) { // CpuPS already expanded as mf_dim + val->mf_size = mf_dim + 1; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = ptr_val[x + 8]; } - - *(reinterpret_cast( - val + feature_value_accessor_.common_feature_value.CpuPtrIndex())) = - (uint64_t)(device_dim_ptrs[k]); - - ptr_val[cpu_table_accessor_->common_feature_value.MfDimIndex()] = - float(mf_dim); - val[feature_value_accessor_.common_feature_value.MfDimIndex()] = mf_dim; - if (dim > cpu_table_accessor_->GetAccessorInfo().dim - - cpu_table_accessor_->GetAccessorInfo().mf_size / - sizeof(float)) { - val[feature_value_accessor_.common_feature_value.MfSizeIndex()] = - feature_value_accessor_.common_feature_value.MFSize(mf_dim) / - sizeof(float); - - for (int x = 0; - x < - int(feature_value_accessor_.common_feature_value.MFSize(mf_dim) / - sizeof(float)); - x++) { - val[feature_value_accessor_.common_feature_value - .EmbedxG2SumIndex() + - x] = ptr_val[cpu_table_accessor_->common_feature_value - .EmbedxG2SumIndex() + - x]; - } - } else { - val[feature_value_accessor_.common_feature_value.MfSizeIndex()] = 0; - for (int x = feature_value_accessor_.common_feature_value - .EmbedxG2SumIndex(); - x < - int(feature_value_accessor_.common_feature_value.Size(mf_dim) / - sizeof(float)); - x++) { - val[x] = 0; - } + } else { + val->mf_size = 0; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = 0; } - VLOG(5) << "build " << k << " : " - << feature_value_accessor_.ParseToString( - val, feature_value_accessor_.common_feature_value.Dim( - mf_dim)); } + } +#endif +#ifdef PADDLE_WITH_PSCORE + for (size_t k = 0; k < len; k++) { + void* val = mem_pool->mem_address(k); + // float* ptr_val = device_dim_ptrs[k]->data(); + // size_t dim = device_dim_ptrs[k]->size(); + // VLOG(5) << "cpu build " << k + // << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) + // << " |: " << cpu_table_accessor_->ParseToString(ptr_val, + // dim); + accessor_wrapper_ptr->BuildFill( + val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim); + VLOG(5) << "build " << k << " : " + << accessor_wrapper_ptr->ParseToString( + (float*)(val), + int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / + sizeof(float))); + } #endif - platform::CUDADeviceGuard guard(resource_->dev_id(i)); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); - this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); - auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); + auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - this->HeterPs_->build_ps(i, device_dim_keys.data(), cur_pool->mem(), len, - feature_value_size, 500000, 2); + this->HeterPs_->build_ps(i, + device_dim_keys.data(), + cur_pool->mem(), + len, + feature_value_size, + 500000, + 2); - if (device_dim_keys.size() > 0) { - VLOG(0) << "show ptr table: " << i - << " table kv size: " << device_dim_keys.size() << "dim: " << mf_dim - << " len: " << len; - this->HeterPs_->show_one_table(i); - 
} - delete mem_pool; -}; -threads.resize(device_num* multi_mf_dim_); -for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_dynamic_mf_func, i, j); + if (device_dim_keys.size() > 0) { + VLOG(0) << "show ptr table: " << i + << " table kv size: " << device_dim_keys.size() + << "dim: " << mf_dim << " len: " << len; + this->HeterPs_->show_one_table(i); + } + delete mem_pool; + }; + + threads.resize(device_num * multi_mf_dim_); + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(build_dynamic_mf_func, i, j); + } } -} -for (std::thread& t : threads) { - t.join(); -} -timeline.Pause(); -VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() + << " s."; } void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { @@ -833,7 +833,8 @@ void PSGPUWrapper::BeginPass() { } if (FLAGS_gpugraph_dedup_pull_push_mode) { VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() - << "s, enable pull push dedup mode=" << FLAGS_gpugraph_dedup_pull_push_mode; + << "s, enable pull push dedup mode=" + << FLAGS_gpugraph_dedup_pull_push_mode; } else { VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; } @@ -856,152 +857,79 @@ void PSGPUWrapper::EndPass() { } } - auto dump_pool_to_cpu_func = - [this](int i, int j) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); - auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - auto& device_keys = this->current_task_->device_dim_keys_[i][j]; - size_t len = device_keys.size(); - int mf_dim = this->index_dim_vec_[j]; - size_t feature_value_size = TYPEALIGN( - 8, feature_value_accessor_.common_feature_value.Size(mf_dim)); - VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim - << " key_len :" << len - << " feature_value_size:" << feature_value_size; - - char* test_build_values = (char*)malloc(feature_value_size * len); - cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, - cudaMemcpyDeviceToHost); - - CHECK(len == hbm_pool->capacity()); - uint64_t unuse_key = std::numeric_limits::max(); - for (size_t index = 0; index < len; ++index) { - if (device_keys[index] == unuse_key) { - continue; - } - size_t offset = index * feature_value_size; - float* gpu_val = (float*)(test_build_values + offset); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + auto dump_pool_to_cpu_func = [this, &accessor_wrapper_ptr](int i, int j) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); + auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; + size_t len = device_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim + << " key_len :" << len + << " feature_value_size:" << feature_value_size; + + char* test_build_values = (char*)malloc(feature_value_size * len); + cudaMemcpy(test_build_values, + hbm_pool->mem(), + feature_value_size * len, + cudaMemcpyDeviceToHost); + + CHECK(len == hbm_pool->capacity()); + uint64_t unuse_key = std::numeric_limits::max(); + for (size_t index = 0; index < len; 
++index) { + if (device_keys[index] == unuse_key) { + continue; + } + size_t offset = index * feature_value_size; + float* gpu_val = (float*)(test_build_values + offset); #ifdef PADDLE_WITH_PSLIB - auto* downpour_value = - (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val->mf_size > 0 && downpour_value_size == 8) { - downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); - } - float* cpu_val = downpour_value->data(); - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::delta_score_index()] = - gpu_val->delta_score; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::show_index()] = - gpu_val->show; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::click_index()] = - gpu_val->clk; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_w_index()] = - gpu_val->lr; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_g2sum_index()] = - gpu_val->lr_g2sum; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::slot_index()] = - gpu_val->slot; - - if (gpu_val->mf_size > 0) { - for (int x = 0; x < gpu_val->mf_dim + 1; x++) { - cpu_val[x + 8] = gpu_val->mf[x]; - } - } - } + // TODO: pslib DumpFill #endif #ifdef PADDLE_WITH_PSCORE - auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*( - reinterpret_cast( - gpu_val + - feature_value_accessor_.common_feature_value.CpuPtrIndex()))); - size_t downpour_value_size = downpour_value->size(); - if (gpu_val[feature_value_accessor_.common_feature_value - .MfSizeIndex()] > 0 && - downpour_value_size == - (cpu_table_accessor_->GetAccessorInfo().dim - - int(cpu_table_accessor_->GetAccessorInfo().mf_size / - sizeof(float)))) { // cpu_accessor - downpour_value->resize( - cpu_table_accessor_->common_feature_value.Dim(mf_dim)); - } - float* cpu_val = downpour_value->data(); - - cpu_val[cpu_table_accessor_->common_feature_value.DeltaScoreIndex()] = - gpu_val[feature_value_accessor_.common_feature_value - .DeltaScoreIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.ShowIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.ShowIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.ClickIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.ClickIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.EmbedWIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.EmbedWIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.SlotIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.SlotIndex()]; - - for (int i = 0; - i < feature_value_accessor_.common_feature_value.EmbedDim(); i++) { - cpu_val[cpu_table_accessor_->common_feature_value.EmbedG2SumIndex() + - i] = gpu_val[feature_value_accessor_.common_feature_value - .EmbedG2SumIndex() + - i]; - } + accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); + // auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*( + // reinterpret_cast(gpu_val))); + // float* cpu_val = downpour_value->data(); + // VLOG(5) << "dump to cpu " << index << " gpu_value: " + // << accessor_wrapper_ptr->ParseToString(gpu_val, + // int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / + // sizeof(float))) + // << " \t cpu_value:" + // << cpu_table_accessor_->ParseToString(cpu_val, + // downpour_value->size()); + } +#endif + free(test_build_values); + }; - if 
(gpu_val[feature_value_accessor_.common_feature_value - .MfSizeIndex()] > 0) { - for (int x = 0; - x < - int(feature_value_accessor_.common_feature_value.MFSize(mf_dim) / - sizeof(float)); - x++) { - cpu_val[cpu_table_accessor_->common_feature_value - .EmbedxG2SumIndex() + - x] = gpu_val[feature_value_accessor_.common_feature_value - .EmbedxG2SumIndex() + - x]; - } - } - VLOG(5) << "dump to cpu " << index << " : " - << feature_value_accessor_.ParseToString( - gpu_val, - feature_value_accessor_.common_feature_value.Dim(mf_dim)) - << " ===== CPU:" - << cpu_table_accessor_->ParseToString(cpu_val, - downpour_value->size()); + if (multi_mf_dim_) { + VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; + size_t device_num = heter_devices_.size(); + std::vector threads(device_num * multi_mf_dim_); + for (size_t i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); } -#endif - free(test_build_values); -}; -if (multi_mf_dim_) { - VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_); - for (size_t i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); + } + for (std::thread& t : threads) { + t.join(); } } - for (std::thread& t : threads) { - t.join(); + if (keysize_max != 0) { + HeterPs_->end_pass(); } -} -if (keysize_max != 0) { - HeterPs_->end_pass(); -} -VLOG(0) << "HeterPs_->end_pass end"; -for (size_t i = 0; i < hbm_pools_.size(); i++) { - delete hbm_pools_[i]; -} -gpu_task_pool_.Push(current_task_); -current_task_ = nullptr; -gpu_free_channel_->Put(current_task_); -timer.Pause(); -VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "HeterPs_->end_pass end"; + for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; + } + gpu_task_pool_.Push(current_task_); + current_task_ = nullptr; + gpu_free_channel_->Put(current_task_); + timer.Pause(); + VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, @@ -1026,7 +954,10 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, platform::Timer pull_gpups_timer; all_timer.Start(); - size_t feature_value_size = feature_value_accessor_.common_pull_value.Size(max_mf_dim_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t feature_value_size = + accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); VLOG(3) << "PullSparse max_dim:" << max_mf_dim_ << " pull_feature_value_size:" << pull_type_size_; @@ -1067,25 +998,40 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, int64_t* slot_lens = dev.slot_lens.mutable_data( (slot_num + 1) * sizeof(int64_t), place); - cudaMemcpyAsync(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), - cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(slot_lens, slot_lengths_lod.data(), + cudaMemcpyAsync(gpu_keys, + keys.data(), + keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(slot_lens, + slot_lengths_lod.data(), slot_lengths_lod.size() * sizeof(int64_t), - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, + stream); - cudaMemcpyAsync(gpu_slot_dims, slot_dim.data(), - slot_dim.size() * sizeof(int), cudaMemcpyHostToDevice, + cudaMemcpyAsync(gpu_slot_dims, + 
slot_dim.data(), + slot_dim.size() * sizeof(int), + cudaMemcpyHostToDevice, stream); float** gpu_values = dev.values_ptr_tensor.mutable_data( values.size() * sizeof(float*), place); - cudaMemcpyAsync(gpu_values, values.data(), values.size() * sizeof(float*), - cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(gpu_values, + values.data(), + values.size() * sizeof(float*), + cudaMemcpyHostToDevice, + stream); int* key2slot = dev.keys2slot.mutable_data( (total_length * 5) * sizeof(int), place); - this->CopyKeys(place, gpu_keys, total_keys, slot_lens, slot_num, - static_cast(total_length), key2slot); + this->CopyKeys(place, + gpu_keys, + total_keys, + slot_lens, + slot_num, + static_cast(total_length), + key2slot); uint32_t* d_restore_idx = reinterpret_cast(&key2slot[total_length]); @@ -1101,7 +1047,8 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, reinterpret_cast(&d_merged_keys[total_length]); int dedup_size = HeterPs_->dedup_keys_and_fillidx( - devid_2_index, static_cast(total_length), + devid_2_index, + static_cast(total_length), total_keys, // input d_merged_keys, // output d_sorted_keys, // sort keys @@ -1110,11 +1057,13 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, d_offset, // offset d_merged_cnts, FLAGS_gpugraph_dedup_pull_push_mode & 0x02); -// printf("device %d, end dedup_keys_and_fillidx total %d, " -// "dedup_size %d, slot num: %d, value size: %d\n", -// device_id, int(total_length), dedup_size, slot_num, int(feature_value_size)); + // printf("device %d, end dedup_keys_and_fillidx total %d, " + // "dedup_size %d, slot num: %d, value size: %d\n", + // device_id, int(total_length), dedup_size, slot_num, + // int(feature_value_size)); - PADDLE_ENFORCE_GT(dedup_size, 0, + PADDLE_ENFORCE_GT(dedup_size, + 0, platform::errors::PreconditionNotMet( "dedup keys need more than zero failed in BoxPS.")); dev.dedup_key_length = dedup_size; @@ -1123,13 +1072,21 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, float* total_values_gpu = dev.pull_push_tensor.mutable_data(total_bytes, place); pull_gpups_timer.Start(); - HeterPs_->pull_sparse(devid_2_index, d_merged_keys, total_values_gpu, - dedup_size); + HeterPs_->pull_sparse( + devid_2_index, d_merged_keys, total_values_gpu, dedup_size); // values.size() not sure equal slot_num - this->CopyForPull(place, total_keys, gpu_values, total_values_gpu, - slot_lens, key2slot, max_mf_dim_ + 3, total_length, - gpu_slot_dims, d_restore_idx); + accessor_wrapper_ptr->CopyForPull(place, + total_keys, + gpu_values, + total_values_gpu, + slot_lens, + key2slot, + max_mf_dim_ + 3, + total_length, + gpu_slot_dims, + d_restore_idx, + feature_value_size); } else { size_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); @@ -1150,32 +1107,48 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), + cudaMemcpy(gpu_keys, + keys.data(), + keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); auto buf_dim = memory::Alloc(place, slot_dim.size() * sizeof(int)); int* gpu_dim = 
reinterpret_cast(buf_dim->ptr()); - cudaMemcpy(gpu_dim, slot_dim.data(), slot_dim.size() * sizeof(int), + cudaMemcpy(gpu_dim, + slot_dim.data(), + slot_dim.size() * sizeof(int), cudaMemcpyHostToDevice); - this->CopyKeys(place, gpu_keys, total_keys, gpu_len, + this->CopyKeys(place, + gpu_keys, + total_keys, + gpu_len, static_cast(slot_lengths.size()), static_cast(total_length)); VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; pull_gpups_timer.Start(); - HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, - total_length); + HeterPs_->pull_sparse( + devid_2_index, total_keys, total_values_gpu, total_length); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, - static_cast(slot_lengths.size()), hidden_size, - total_length, gpu_dim); + accessor_wrapper_ptr->CopyForPull(place, + gpu_keys, + values, + total_values_gpu, + gpu_len, + static_cast(slot_lengths.size()), + hidden_size, + total_length, + gpu_dim, + feature_value_size); } pull_gpups_timer.Pause(); @@ -1207,28 +1180,41 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); uint64_t** xpu_keys = reinterpret_cast(buf_key->ptr()); int64_t* xpu_len = reinterpret_cast(buf_length->ptr()); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_keys, keys.data(), + PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_keys, + keys.data(), keys.size() * sizeof(uint64_t*), XPU_HOST_TO_DEVICE)); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_len, slot_lengths_lod.data(), + PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_len, + slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE)); - this->CopyKeys(place, xpu_keys, total_keys, xpu_len, + this->CopyKeys(place, + xpu_keys, + total_keys, + xpu_len, static_cast(slot_lengths.size()), static_cast(total_length)); VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; pull_gpups_timer.Start(); - HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, + HeterPs_->pull_sparse(devid_2_index, + total_keys, + total_values_gpu, static_cast(total_length)); pull_gpups_timer.Pause(); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, xpu_keys, values, total_values_gpu, xpu_len, - static_cast(slot_lengths.size()), hidden_size, - total_length); + accessor_wrapper_ptr->CopyForPull(place, + xpu_keys, + values, + total_values_gpu, + xpu_len, + static_cast(slot_lengths.size()), + hidden_size, + total_length, + feature_value_size); #endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -1246,12 +1232,14 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, const std::vector& keys, const std::vector& grad_values, const std::vector& slot_lengths, - const int hidden_size, const int batch_size) { + const int hidden_size, + const int batch_size) { platform::Timer all_timer; platform::Timer push_gpups_timer; all_timer.Start(); - size_t grad_value_size = - TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( @@ -1275,17 +1263,22 @@ void PSGPUWrapper::PushSparseGrad(const 
paddle::platform::Place& place, if (!dev.d_slot_vector.IsInitialized()) { int* buf_slot_vector = dev.d_slot_vector.mutable_data(slot_num * sizeof(int), place); - cudaMemcpyAsync(buf_slot_vector, slot_vector_.data(), - slot_num * sizeof(int), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(buf_slot_vector, + slot_vector_.data(), + slot_num * sizeof(int), + cudaMemcpyHostToDevice, + stream); } const int64_t* slot_lens = dev.slot_lens.data(); const int* d_slot_vector = dev.d_slot_vector.data(); const int* key2slot = dev.keys2slot.data(); float** gpu_values = dev.values_ptr_tensor.data(); - cudaMemcpyAsync(gpu_values, grad_values.data(), + cudaMemcpyAsync(gpu_values, + grad_values.data(), grad_values.size() * sizeof(float*), - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, + stream); uint64_t* d_merged_keys = &total_keys[total_length]; @@ -1296,26 +1289,50 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, // dedup rate more than 3 if (total_length > dedup_size * 3) { const uint32_t* d_restore_idx = - reinterpret_cast(&key2slot[total_length]); - this->CopyForPush(place, total_keys, gpu_values, total_grad_values_gpu, - d_slot_vector, slot_lens, max_mf_dim_ + 3, total_length, - dedup_size, batch_size, slot_dims, key2slot, - d_restore_idx, grad_value_size); + reinterpret_cast(&key2slot[total_length]); + accessor_wrapper_ptr->CopyForPush(place, + total_keys, + gpu_values, + total_grad_values_gpu, + d_slot_vector, + slot_lens, + max_mf_dim_ + 3, + total_length, + dedup_size, + batch_size, + slot_dims, + key2slot, + d_restore_idx, + grad_value_size); } else { const uint32_t* d_sorted_idx = - reinterpret_cast(&key2slot[total_length * 2]); + reinterpret_cast(&key2slot[total_length * 2]); const uint32_t* d_offset = - reinterpret_cast(&d_sorted_idx[total_length]); + reinterpret_cast(&d_sorted_idx[total_length]); const uint32_t* d_merged_cnts = - reinterpret_cast(&d_offset[total_length]); - this->CopyForPush(place, d_merged_keys, gpu_values, total_grad_values_gpu, - d_slot_vector, slot_lens, max_mf_dim_ + 3, total_length, - dedup_size, batch_size, slot_dims, key2slot, - d_sorted_idx, d_offset, d_merged_cnts, grad_value_size); + reinterpret_cast(&d_offset[total_length]); + accessor_wrapper_ptr->CopyForPush(place, + d_merged_keys, + gpu_values, + total_grad_values_gpu, + d_slot_vector, + slot_lens, + max_mf_dim_ + 3, + total_length, + dedup_size, + batch_size, + slot_dims, + key2slot, + d_sorted_idx, + d_offset, + d_merged_cnts, + grad_value_size); } push_gpups_timer.Start(); - HeterPs_->push_sparse(devid_2_index, d_merged_keys, total_grad_values_gpu, + HeterPs_->push_sparse(devid_2_index, + d_merged_keys, + total_grad_values_gpu, static_cast(dedup_size)); } else { int64_t total_length = @@ -1332,13 +1349,22 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, reinterpret_cast(total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to gpups struct"; - this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, - total_length, batch_size, grad_value_size); + accessor_wrapper_ptr->CopyForPush(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + total_length, + batch_size, + grad_value_size, + slot_vector_, + slot_mf_dim_vector_); VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; push_gpups_timer.Start(); - HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, + HeterPs_->push_sparse(devid_2_index, + total_keys, + total_grad_values_gpu, 
static_cast(total_length)); } push_gpups_timer.Pause(); @@ -1359,13 +1385,21 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to xpups struct"; - this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, - hidden_size, total_length, batch_size); + accessor_wrapper_ptr->CopyForPush(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + hidden_size, + total_length, + batch_size, + slot_vector_); VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index << " len: " << total_length; push_gpups_timer.Start(); - HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, + HeterPs_->push_sparse(devid_2_index, + total_keys, + total_grad_values_gpu, static_cast(total_length)); push_gpups_timer.Pause(); #endif @@ -1382,6 +1416,6 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, VLOG(3) << "End PushSparseGrad"; } -} // end namespace framework +} // namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index d9db24e0d0183..73506538e2358 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -31,88 +31,6 @@ const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS; #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS) #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0 -__global__ void PullCopy(float** dest, const FeatureValue* src, - const int64_t* len, int hidden, int slot_num, - int total_len, uint64_t** keys) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[x - 1] : 0); - if (*(keys[x] + y) == 0) { - *(dest[x] + y * hidden) = 0; - *(dest[x] + y * hidden + 1) = 0; - *(dest[x] + y * hidden + 2) = 0; - } else { - *(dest[x] + y * hidden) = (src + i)->show; - *(dest[x] + y * hidden + 1) = (src + i)->clk; - *(dest[x] + y * hidden + 2) = (src + i)->lr; - } - if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = 0; - } - } else { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j]; - } - } - } -} - -template -__global__ void PullCopy(float** dest, const float* src, const int64_t* len, - int slot_num, int total_len, uint64_t** keys, - uint64_t max_val_size, int* gpu_dim, - TAccess accessor) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? 
len[x - 1] : 0); - float* feature_value_ptr = - (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); - int mf_dim = gpu_dim[x] - 3; - if (*(keys[x] + y) == 0) { - *(dest[x] + y * (mf_dim + 3)) = 0; - *(dest[x] + y * (mf_dim + 3) + 1) = 0; - *(dest[x] + y * (mf_dim + 3) + 2) = 0; - } else { - *(dest[x] + y * (mf_dim + 3)) = - feature_value_ptr[accessor.ShowIndex()]; - *(dest[x] + y * (mf_dim + 3) + 1) = - feature_value_ptr[accessor.ClickIndex()]; - *(dest[x] + y * (mf_dim + 3) + 2) = - feature_value_ptr[accessor.EmbedWIndex()]; - } - - if (feature_value_ptr[accessor.MfSizeIndex()] == 0 || - *(keys[x] + y) == 0) { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0; - } - } else { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = - feature_value_ptr[accessor.EmbedxWIndex() + j]; - } - } - } -} __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, int slot_num, @@ -158,84 +76,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len, } } -__global__ void PushCopyWithPool( - float* dest, float** src, int64_t* len, int slot_num, uint64_t total_len, - int bs, int* slot_vector, int* mf_dim_vector, size_t grad_value_size, - CommonFeatureValueAccessor feature_value_accessor) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[low - 1] : 0); - float* cur = (float*)((char*)dest + i * grad_value_size); - - cur[feature_value_accessor.common_push_value.SlotIndex()] = - (float)slot_vector[x]; - int mf_dim = mf_dim_vector[x]; - cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim; - - cur[feature_value_accessor.common_push_value.ShowIndex()] = - *(src[x] + y * (mf_dim + 3)); - cur[feature_value_accessor.common_push_value.ClickIndex()] = - *(src[x] + y * (mf_dim + 3) + 1); - cur[feature_value_accessor.common_push_value.EmbedGIndex()] = - *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; - for (int j = 0; j < mf_dim; j++) { - cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = - *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; - } - } -} PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const int64_t total_length) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num, - total_length, gpu_keys); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const float* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const int64_t total_length, int* gpu_dim) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys, - pull_type_size_, gpu_dim, feature_value_accessor_.common_pull_value); - cudaStreamSynchronize(stream); -} - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, @@ -271,6 +113,7 @@ __global__ void CopyKeysKernel2( dest_total_keys[i] = src_keys[low][y]; } } + void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* slot_lens, int slot_num, @@ -282,320 +125,6 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, total_len, origin_keys, total_keys, slot_num, slot_lens, key2slot); cudaStreamSynchronize(stream); } -template -__global__ void PullDedupCopy( - const size_t N, const uint64_t* total_keys, float** dest, const float* src, - const int64_t* slot_lens, uint64_t max_val_size, const int* slot_dims, - const int hidden, const int* key2slot, const uint32_t* restore_idx, - TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { - int i = idx / hidden; - int off = idx % hidden; - - int x = key2slot[i]; - int y = i - slot_lens[x]; - - assert(slot_dims[x] == hidden); - float* dest_ptr = dest[x] + y * hidden; - // 0 key fill zero - if (total_keys[i] == 0) { - *(dest_ptr + off) = 0; - return; - } - - float* src_ptr = - (float*)((char*)src + - uint64_t(restore_idx[i]) * uint64_t(max_val_size)); - switch (off) { - case 0: - *(dest_ptr + off) = src_ptr[accessor.ShowIndex()]; - break; - case 1: - *(dest_ptr + off) = src_ptr[accessor.ClickIndex()]; - break; - case 2: - *(dest_ptr + off) = src_ptr[accessor.EmbedWIndex()]; - break; - default: - if (src_ptr[accessor.MfSizeIndex()] == 0) { - *(dest_ptr + off) = 0; - } else { - *(dest_ptr + off) = - src_ptr[accessor.EmbedxWIndex() + off - 3]; - } - break; - } - } -} -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - const uint64_t* total_keys, float** gpu_values, - const float* 
total_values_gpu, - const int64_t* slot_lens, const int* key2slot, - const int hidden_size, - const int64_t total_length, - const int* slot_dims, - const uint32_t* gpu_restore_idx) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - size_t N = total_length * hidden_size; - PullDedupCopy<<>>( - N, total_keys, gpu_values, total_values_gpu, slot_lens, pull_type_size_, - slot_dims, hidden_size, key2slot, gpu_restore_idx, - feature_value_accessor_.common_pull_value); - cudaStreamSynchronize(stream); -} -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - - cudaMemcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - - PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, gpu_values, gpu_len, hidden_size, - slot_lengths.size(), total_length, batch_size, d_slot_vector); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - float* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, - const int batch_size, size_t grad_value_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - auto buf_mf_dim_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); - cudaMemcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_mf_dim_vector, 
slot_mf_dim_vector_.data(), - slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), - total_length, batch_size, d_slot_vector, d_mf_dim_vector, grad_value_size, - feature_value_accessor_); - cudaStreamSynchronize(stream); -} - -template -__global__ void PushMergeCopyAtomic( - const size_t N, const uint64_t* total_keys, float* dest, float** src, - const int hidden, const int bs, const int* slot_vector, - const int* slot_dims, const int64_t* slot_lens, const int* key2slot, - const uint32_t* d_restore_idx, size_t grad_value_size, - TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { - int i = idx / hidden; - int off = idx % hidden; - // filter 0 keys - if (total_keys[i] == 0) { - return; - } - - int x = key2slot[i]; - int y = i - slot_lens[x]; - - const float* ptr = src[x] + y * hidden; - float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size); - int mf_dim = slot_dims[x] - 3; - switch (off) { - case 0: - cur[accessor.SlotIndex()] = (float)slot_vector[x]; - cur[accessor.MfDimIndex()] = mf_dim; - paddle::platform::CudaAtomicAdd( - &cur[accessor.ShowIndex()], *(ptr + off)); - break; - case 1: - paddle::platform::CudaAtomicAdd( - &cur[accessor.ClickIndex()], *(ptr + off)); - break; - case 2: - paddle::platform::CudaAtomicAdd( - &cur[accessor.EmbedGIndex()], *(ptr + off) * -1. * bs); - break; - default: - int embedx_idx = off - 3; - if (mf_dim < embedx_idx) { - return; - } - paddle::platform::CudaAtomicAdd( - &cur[accessor.EmbedxGIndex() + embedx_idx], *(ptr + off) * -1. * bs); - break; - } - } -} - -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const uint64_t* total_keys, float** grad_values, - float* total_grad_values_gpu, const int* slots, - const int64_t* slot_lens, const int hidden_size, - const int64_t total_length, - const int64_t dedup_length, const int batch_size, - const int* slot_dims, const int* key2slot, - const uint32_t* d_restore_idx, - const size_t grad_value_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - cudaMemsetAsync(total_grad_values_gpu, 0, dedup_length * grad_value_size, - stream); - size_t N = total_length * hidden_size; - PushMergeCopyAtomic<<>>( - N, total_keys, total_grad_values_gpu, grad_values, hidden_size, - batch_size, slots, slot_dims, slot_lens, key2slot, d_restore_idx, - grad_value_size, feature_value_accessor_.common_push_value); - - cudaStreamSynchronize(stream); -} - -#define SUM_GRAD_VALUE \ - for (uint32_t j = 0; j < count; ++j) { \ - const uint32_t& pos = d_sort_idx[start + j]; \ - const int& x = key2slot[pos]; \ - y = pos - slot_lens[x]; \ - val += *(reinterpret_cast(src[x] + y * hidden + off)); \ - } - -template -__global__ void PushMergeCopy( - const size_t N, const uint64_t* total_keys, float* dest, float** src, - const int hidden, const int bs, const int* slot_vector, - const int* slot_dims, const int64_t* slot_lens, const int* key2slot, - const uint32_t* d_sort_idx, - const uint32_t* d_sort_offset, - const uint32_t* d_sort_cnt, size_t grad_value_size, - TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { - int i = idx / hidden; - int off = idx % hidden; - // filter 0 keys - float* cur = (float*)((char*)dest + i * grad_value_size); - - if (total_keys[i] == 0) { - switch (off) { - case 0: - cur[accessor.SlotIndex()] = 0; - cur[accessor.MfDimIndex()] = 0; - cur[accessor.ShowIndex()] = 0.0; - break; - case 1: - 
cur[accessor.ClickIndex()] = 0.0; - break; - case 2: - cur[accessor.EmbedGIndex()] = 0.0; - break; - default: - cur[accessor.EmbedxGIndex() + off - 3] = 0.0; - break; - } - return; - } - - const uint32_t& start = d_sort_offset[i]; - const uint32_t& count = d_sort_cnt[i]; - const uint32_t& pos = d_sort_idx[start]; - - const int& x = key2slot[pos]; - int y = pos - slot_lens[x]; - int mf_dim = slot_dims[x] - 3; - - double val = 0.0; - - switch (off) { - case 0: - cur[accessor.SlotIndex()] = (float)slot_vector[x]; - cur[accessor.MfDimIndex()] = mf_dim; - SUM_GRAD_VALUE - cur[accessor.ShowIndex()] = val; - break; - case 1: - SUM_GRAD_VALUE - cur[accessor.ClickIndex()] = val; - break; - case 2: - SUM_GRAD_VALUE - cur[accessor.EmbedGIndex()] = val * -1. * bs; - break; - default: - int embedx_idx = off - 3; - if (mf_dim < embedx_idx) { - cur[accessor.EmbedxGIndex() + embedx_idx] = 0.0; - return; - } - SUM_GRAD_VALUE - cur[accessor.EmbedxGIndex() + embedx_idx] = val * -1. * bs; - break; - } - } -} - -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const uint64_t* total_keys, float** grad_values, - float* total_grad_values_gpu, const int* slots, - const int64_t* slot_lens, const int hidden_size, - const int64_t total_length, const int64_t dedup_length, - const int batch_size, const int* slot_dims, - const int* key2slot, - const uint32_t* gpu_sort_idx, - const uint32_t* gpu_sort_offset, - const uint32_t* gpu_sort_lens, - const size_t grad_value_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - // merge all grad to one - size_t N = dedup_length * hidden_size; - PushMergeCopy<<>>( - N, total_keys, total_grad_values_gpu, grad_values, hidden_size, - batch_size, slots, slot_dims, slot_lens, key2slot, - gpu_sort_idx, gpu_sort_offset, gpu_sort_lens, - grad_value_size, feature_value_accessor_.common_push_value); - cudaStreamSynchronize(stream); -} void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index babee603b1517..b69cbccd0c1c9 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -65,9 +65,6 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -#define TYPEALIGN(ALIGNVAL, LEN) \ - (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) - class Dataset; #ifdef PADDLE_WITH_PSLIB @@ -174,53 +171,6 @@ class PSGPUWrapper { void CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, int total_len, int* key2slot); - void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, const int64_t* gpu_len, - const int slot_num, const int hidden_size, - const int64_t total_length); - void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, - const std::vector& values, - const float* total_values_gpu, const int64_t* gpu_len, - const int slot_num, const int hidden_size, - const int64_t total_length, int* gpu_dim); - void CopyForPull(const paddle::platform::Place& place, - const uint64_t* total_keys, float** gpu_values, - const float* total_values_gpu, const int64_t* slot_lens, - const int* key2slot, const int hidden_size, - const int64_t total_length, const int* slot_dims, - const uint32_t* gpu_restore_idx); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, const int64_t total_length, - const int batch_size); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - float* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, const int batch_size, - size_t grad_value_size); - void CopyForPush(const paddle::platform::Place& place, - const uint64_t* total_keys, float** grad_values, - float* total_grad_values_gpu, const int* slots, - const int64_t* slot_lens, const int hidden_size, - const int64_t total_length, const int64_t dedup_length, - const int batch_size, const int* slot_dims, - const int* key2slot, const uint32_t* d_restore_idx, - const size_t grad_value_size); - void CopyForPush(const paddle::platform::Place& place, - const uint64_t* total_keys, float** grad_values, - float* total_grad_values_gpu, const int* slots, - const int64_t* slot_lens, const int hidden_size, - const int64_t total_length, const int64_t dedup_length, - const int batch_size, const int* slot_dims, - const int* key2slot, - const uint32_t* gpu_sort_idx, - const uint32_t* gpu_sort_offset, - const uint32_t* gpu_sort_lens, - const size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); @@ -402,7 +352,7 @@ class PSGPUWrapper { auto sparse_table_accessor = sparse_table.accessor(); auto sparse_table_accessor_parameter = sparse_table_accessor.ctr_accessor_param(); - auto accessor_class = sparse_table_accessor.accessor_class(); + accessor_class_ = sparse_table_accessor.accessor_class(); std::unordered_map config; config["embedx_dim"] = sparse_table_accessor.embedx_dim(); @@ -413,14 +363,17 @@ class PSGPUWrapper { config["nodeid_slot"] = sparse_table_accessor.graph_sgd_param().nodeid_slot(); config["feature_learning_rate"] = sparse_table_accessor.graph_sgd_param().feature_learning_rate(); - if (accessor_class == "CtrDymfAccessor") { + if (accessor_class_ == "CtrDymfAccessor") { // optimizer config for embed_w and embedx add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); add_sparse_optimizer(config, sparse_table_accessor.embedx_sgd_param(), "mf_"); } - 
feature_value_accessor_.Configure(config); + fleet_config_ = config; + GlobalAccessorTransfor::GetInstance().Init(accessor_class_); + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper()->Configure( + config); InitializeGPUServer(config); } #endif @@ -505,24 +458,8 @@ class PSGPUWrapper { optimizer_type_ = (config.find("optimizer_type") == config.end()) ? 1 : int(config["optimizer_type"]); - embedx_dim_ = (config.find("embedx_dim") == config.end()) - ? 8 - : int(config["embedx_dim"]); - if (optimizer_type_ == 3) { // adam - embed_sgd_dim_ = 4; - embedx_sgd_dim_ = embedx_dim_ * 2 + 2; - } else if (optimizer_type_ == 4) { // shared_adam - embed_sgd_dim_ = 4; - embedx_sgd_dim_ = 4; - } else { - embed_sgd_dim_ = 1; - embedx_sgd_dim_ = 1; - } - VLOG(0) << "InitializeGPUServer embed_sgd_dim_:" << embed_sgd_dim_ - << " embedx_sgd_dim_:" << embedx_sgd_dim_ - << " embedx_dim_:" << embedx_dim_ - << " optimizer_type_:" << optimizer_type_ + VLOG(0) << "InitializeGPUServer optimizer_type_:" << optimizer_type_ << " nodeid_slot:" << nodeid_slot << " feature_learning_rate:" << feature_learning_rate; } @@ -610,12 +547,13 @@ class PSGPUWrapper { for (size_t i = 0; i < slot_index_vec_.size(); i++) { slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]]; } - val_type_size_ = TYPEALIGN( - 8, feature_value_accessor_.common_feature_value.Size(max_mf_dim_)); - grad_type_size_ = TYPEALIGN( - 8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); - pull_type_size_ = feature_value_accessor_.common_pull_value.Size(max_mf_dim_); - VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_ + + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + pull_type_size_ = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); + VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_ << " grad_type_size_:" << grad_type_size_ << " pull_type_size_:" << pull_type_size_; slot_info_initialized_ = true; @@ -638,13 +576,10 @@ class PSGPUWrapper { #ifdef PADDLE_WITH_PSCORE void SetTableAccessor(paddle::distributed::ValueAccessor* accessor) { - cpu_table_accessor_ = - dynamic_cast<paddle::distributed::CtrDymfAccessor*>(accessor); + cpu_table_accessor_ = accessor; } #endif - CommonFeatureValueAccessor feature_value_accessor_; - private: static std::shared_ptr<PSGPUWrapper> s_instance_; Dataset* dataset_; @@ -699,11 +634,10 @@ class PSGPUWrapper { bool slot_info_initialized_ = false; int use_afs_api_ = 0; int optimizer_type_ = 1; - int embed_sgd_dim_ = 1; - int embedx_sgd_dim_ = 1; - int embedx_dim_ = 8; + std::string accessor_class_; + std::unordered_map<std::string, float> fleet_config_; #ifdef PADDLE_WITH_PSCORE - paddle::distributed::CtrDymfAccessor* cpu_table_accessor_; + paddle::distributed::ValueAccessor* cpu_table_accessor_; #endif #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index f1084dc4d758b..df8ad45bb472d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -169,33 +169,6 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector<float*>& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const
int64_t total_length) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - float* buf_value = nullptr; - xpu_malloc(reinterpret_cast(&buf_value), - values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(&buf_value); - xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), - XPU_HOST_TO_DEVICE); - - unsigned long long** c_keys = (unsigned long long**)gpu_keys; - const long long* c_len = (const long long*)gpu_len; - PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size, - slot_num, total_length, c_keys); - - xpu_wait(stream); -} - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, @@ -213,51 +186,6 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, xpu_wait(stream); } -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - - float* buf_grad_value = nullptr; - int64_t* buf_length = nullptr; - int* buf_slot_vector = nullptr; - - xpu_malloc(reinterpret_cast(&buf_grad_value), - grad_values.size() * sizeof(float*)); - xpu_malloc(reinterpret_cast(&buf_length), - slot_lengths.size() * sizeof(int64_t)); - xpu_malloc(reinterpret_cast(&buf_slot_vector), - slot_lengths_lod.size() * sizeof(int)); - - float** gpu_values = reinterpret_cast(&buf_grad_value); - int64_t* gpu_len = reinterpret_cast(buf_length); - int* d_slot_vector = reinterpret_cast(buf_slot_vector); - xpu_memcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); - xpu_memcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE); - xpu_memcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE); - - long long* c_len = (long long*)gpu_len; - PushCopy<<<2, 64, stream>>>(total_grad_values_gpu, gpu_values, c_len, - hidden_size, slot_lengths.size(), total_length, - batch_size, d_slot_vector); - xpu_wait(stream); -} - } // end namespace framework } // end namespace paddle #endif diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index ffc3f2b21a476..f81ea5f5572c5 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -326,6 +326,13 @@ def test_fleet_desc_configs(self): .accessor.embed_sgd_param.adagrad.initial_range, 0.0001) + strategy = paddle.distributed.fleet.DistributedStrategy() + configs = {} + configs['emb'] = {"sparse_optimizer": "shared_adam"} + strategy.fleet_desc_configs = configs + self.assertEqual(strategy.sparse_table_configs[0] + .accessor.embed_sgd_param.adam.beta1_decay_rate, 0.9) + def test_trainer_desc_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() configs = {