diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
index d62fc1c084962..e342d4422a5ed 100644
--- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt
@@ -7,10 +7,10 @@ IF(WITH_GPU)
     get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)
     SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS})
   endif()
-  nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
-  nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h DEPS ${HETERPS_DEPS})
-  nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
-  nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm)
+  nv_library(heter_comm_kernel SRCS heter_comm_kernel.cu feature_value.h feature_value.cu DEPS ${HETERPS_DEPS})
+  nv_library(hashtable_kernel SRCS hashtable_kernel.cu feature_value.h feature_value.cu DEPS ${HETERPS_DEPS})
+  nv_library(heter_comm SRCS heter_comm.h feature_value.h feature_value.cu heter_resource.cc heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
+  nv_test(test_heter_comm SRCS DEPS heter_comm)
   nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
   if(WITH_PSCORE)
     nv_library(graph_gpu_ps SRCS graph_gpu_ps_table_inl.cu DEPS heter_comm table hashtable_kernel)
diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu
new file mode 100644
index 0000000000000..25ad7e99f2895
--- /dev/null
+++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu
@@ -0,0 +1,445 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+
+namespace paddle {
+namespace framework {
+
+const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS;
+#define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS)
+#define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0
+
+template <typename FVAccessor>
+__global__ void PullCopy(float** dest,
+                         const float* src,
+                         const int64_t* len,
+                         int slot_num,
+                         int total_len,
+                         uint64_t** keys,
+                         uint64_t max_val_size,
+                         int* gpu_dim,
+                         FVAccessor feature_value_accessor) {
+  CUDA_KERNEL_LOOP(i, total_len) {
+    int low = 0;
+    int high = slot_num - 1;
+    while (low < high) {
+      int mid = (low + high) / 2;
+      if (i < len[mid])
+        high = mid;
+      else
+        low = mid + 1;
+    }
+    int x = low;
+    int y = i - (x ?
len[x - 1] : 0); + float* feature_value_ptr = + (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + int mf_dim = gpu_dim[x] - 3; + feature_value_accessor.Select( + dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); + } +} + +template +__global__ void PullDedupCopy( + const size_t N, const uint64_t* total_keys, float** dest, const float* src, + const int64_t* slot_lens, uint64_t max_val_size, const int* slot_dims, + const int hidden, const int* key2slot, const uint32_t* restore_idx, + TAccess accessor) { + CUDA_KERNEL_LOOP(idx, N) { + int i = idx / hidden; + int off = idx % hidden; + + int x = key2slot[i]; + int y = i - slot_lens[x]; + + assert(slot_dims[x] == hidden); + float* dest_ptr = dest[x] + y * hidden; + // 0 key fill zero + if (total_keys[i] == 0) { + *(dest_ptr + off) = 0; + return; + } + + float* src_ptr = + (float*)((char*)src + + uint64_t(restore_idx[i]) * uint64_t(max_val_size)); + switch (off) { + case 0: + *(dest_ptr + off) = src_ptr[accessor.ShowIndex()]; + break; + case 1: + *(dest_ptr + off) = src_ptr[accessor.ClickIndex()]; + break; + case 2: + *(dest_ptr + off) = src_ptr[accessor.EmbedWIndex()]; + break; + default: + if (src_ptr[accessor.MfSizeIndex()] == 0) { + *(dest_ptr + off) = 0; + } else { + *(dest_ptr + off) = + src_ptr[accessor.EmbedxWIndex() + off - 3]; + } + break; + } + } +} + +template +__global__ void PushCopyWithPool(float* dest, + float** src, + int64_t* len, + int slot_num, + uint64_t total_len, + int bs, + int* slot_vector, + int* mf_dim_vector, + size_t grad_value_size, + FVAccessor feature_value_accessor) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + float* cur = (float*)((char*)dest + i * grad_value_size); + + cur[feature_value_accessor.common_push_value.SlotIndex()] = + (float)slot_vector[x]; + int mf_dim = mf_dim_vector[x]; + cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim; + + cur[feature_value_accessor.common_push_value.ShowIndex()] = + *(src[x] + y * (mf_dim + 3)); + cur[feature_value_accessor.common_push_value.ClickIndex()] = + *(src[x] + y * (mf_dim + 3) + 1); + cur[feature_value_accessor.common_push_value.EmbedGIndex()] = + *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + for (int j = 0; j < mf_dim; j++) { + cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = + *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; + } + } +} + +template +__global__ void PushMergeCopyAtomic( + const size_t N, const uint64_t* total_keys, float* dest, float** src, + const int hidden, const int bs, const int* slot_vector, + const int* slot_dims, const int64_t* slot_lens, const int* key2slot, + const uint32_t* d_restore_idx, size_t grad_value_size, + TAccess accessor) { + CUDA_KERNEL_LOOP(idx, N) { + int i = idx / hidden; + int off = idx % hidden; + // filter 0 keys + if (total_keys[i] == 0) { + return; + } + + int x = key2slot[i]; + int y = i - slot_lens[x]; + + const float* ptr = src[x] + y * hidden; + float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size); + int mf_dim = slot_dims[x] - 3; + switch (off) { + case 0: + cur[accessor.SlotIndex()] = (float)slot_vector[x]; + cur[accessor.MfDimIndex()] = mf_dim; + paddle::platform::CudaAtomicAdd( + &cur[accessor.ShowIndex()], *(ptr + off)); + break; + case 1: + paddle::platform::CudaAtomicAdd( + &cur[accessor.ClickIndex()], *(ptr + off)); + break; + case 2: + paddle::platform::CudaAtomicAdd( + &cur[accessor.EmbedGIndex()], *(ptr + off) * -1. * bs); + break; + default: + int embedx_idx = off - 3; + if (mf_dim < embedx_idx) { + return; + } + paddle::platform::CudaAtomicAdd( + &cur[accessor.EmbedxGIndex() + embedx_idx], *(ptr + off) * -1. * bs); + break; + } + } +} + +#define SUM_GRAD_VALUE \ + for (uint32_t j = 0; j < count; ++j) { \ + const uint32_t& pos = d_sort_idx[start + j]; \ + const int& x = key2slot[pos]; \ + y = pos - slot_lens[x]; \ + val += *(reinterpret_cast(src[x] + y * hidden + off)); \ + } + +template +__global__ void PushMergeCopy( + const size_t N, const uint64_t* total_keys, float* dest, float** src, + const int hidden, const int bs, const int* slot_vector, + const int* slot_dims, const int64_t* slot_lens, const int* key2slot, + const uint32_t* d_sort_idx, + const uint32_t* d_sort_offset, + const uint32_t* d_sort_cnt, size_t grad_value_size, + TAccess accessor) { + CUDA_KERNEL_LOOP(idx, N) { + int i = idx / hidden; + int off = idx % hidden; + // filter 0 keys + float* cur = (float*)((char*)dest + i * grad_value_size); + + if (total_keys[i] == 0) { + switch (off) { + case 0: + cur[accessor.SlotIndex()] = 0; + cur[accessor.MfDimIndex()] = 0; + cur[accessor.ShowIndex()] = 0.0; + break; + case 1: + cur[accessor.ClickIndex()] = 0.0; + break; + case 2: + cur[accessor.EmbedGIndex()] = 0.0; + break; + default: + cur[accessor.EmbedxGIndex() + off - 3] = 0.0; + break; + } + return; + } + + const uint32_t& start = d_sort_offset[i]; + const uint32_t& count = d_sort_cnt[i]; + const uint32_t& pos = d_sort_idx[start]; + + const int& x = key2slot[pos]; + int y = pos - slot_lens[x]; + int mf_dim = slot_dims[x] - 3; + + double val = 0.0; + + switch (off) { + case 0: + cur[accessor.SlotIndex()] = (float)slot_vector[x]; + cur[accessor.MfDimIndex()] = mf_dim; + SUM_GRAD_VALUE + cur[accessor.ShowIndex()] = val; + break; + case 1: + SUM_GRAD_VALUE + cur[accessor.ClickIndex()] = val; + break; + case 2: + SUM_GRAD_VALUE + cur[accessor.EmbedGIndex()] = val * -1. * bs; + break; + default: + int embedx_idx = off - 3; + if (mf_dim < embedx_idx) { + cur[accessor.EmbedxGIndex() + embedx_idx] = 0.0; + return; + } + SUM_GRAD_VALUE + cur[accessor.EmbedxGIndex() + embedx_idx] = val * -1. 
* bs; + break; + } + } +} + +template +void AccessorWrapper::CopyForPullImpl( + const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, + values.data(), + values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, + total_values_gpu, + gpu_len, + slot_num, + total_length, + gpu_keys, + feature_value_size, + gpu_dim, + gpu_accessor_); + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPushImpl( + const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + auto buf_mf_dim_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); + cudaMemcpy(gpu_values, + grad_values.data(), + grad_values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, + slot_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, + slot_mf_dim_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); + PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, + gpu_values, + gpu_len, + slot_lengths.size(), + total_length, + batch_size, + d_slot_vector, + d_mf_dim_vector, + grad_value_size, + gpu_accessor_); + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPullDedupImpl( + const paddle::platform::Place& place, + const uint64_t* total_keys, float** gpu_values, + const float* total_values_gpu, + const int64_t* slot_lens, const int* key2slot, + const int hidden_size, + const int64_t total_length, + const int* slot_dims, + const uint32_t* gpu_restore_idx, + int pull_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + size_t N = total_length * hidden_size; + PullDedupCopy<<>>( + N, total_keys, gpu_values, total_values_gpu, slot_lens, pull_value_size, + slot_dims, hidden_size, key2slot, gpu_restore_idx, + 
gpu_accessor_.common_pull_value); + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPushDedupImpl( + const paddle::platform::Place& place, + const uint64_t* total_keys, float** grad_values, + float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, + const int64_t dedup_length, const int batch_size, + const int* slot_dims, const int* key2slot, + const uint32_t* d_restore_idx, + const size_t grad_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + cudaMemsetAsync(total_grad_values_gpu, 0, dedup_length * grad_value_size, + stream); + size_t N = total_length * hidden_size; + PushMergeCopyAtomic<<>>( + N, total_keys, total_grad_values_gpu, grad_values, hidden_size, + batch_size, slots, slot_dims, slot_lens, key2slot, d_restore_idx, + grad_value_size, gpu_accessor_.common_push_value); + + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPushDedupImpl( + const paddle::platform::Place& place, + const uint64_t* total_keys, float** grad_values, + float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, const int64_t dedup_length, + const int batch_size, const int* slot_dims, + const int* key2slot, + const uint32_t* gpu_sort_idx, + const uint32_t* gpu_sort_offset, + const uint32_t* gpu_sort_lens, + const size_t grad_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + // merge all grad to one + size_t N = dedup_length * hidden_size; + PushMergeCopy<<>>( + N, total_keys, total_grad_values_gpu, grad_values, hidden_size, + batch_size, slots, slot_dims, slot_lens, key2slot, + gpu_sort_idx, gpu_sort_offset, gpu_sort_lens, + grad_value_size, gpu_accessor_.common_push_value); + cudaStreamSynchronize(stream); +} + +#ifdef PADDLE_WITH_PSCORE +template class AccessorWrapper; +#endif + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 134fd1f2760c0..d237521eecf91 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -20,24 +20,34 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#endif namespace paddle { namespace framework { #define MF_DIM 8 typedef uint64_t FeatureKey; +#define TYPEALIGN(ALIGNVAL, LEN) \ + (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) class FeatureValueAccessor { public: - __host__ __device__ FeatureValueAccessor() {} + __host__ __device__ FeatureValueAccessor() {} __host__ __device__ ~FeatureValueAccessor() {} - __host__ __device__ virtual int Configure(std::unordered_map config) { + __host__ __device__ virtual int Configure( + std::unordered_map config) { _config = config; Initialize(); return 0; } - __host__ __device__ virtual int Initialize() = 0; + __host__ __device__ virtual int Initialize() = 0; protected: std::unordered_map _config; @@ -63,47 +73,58 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { std::vector embedx_w; */ - __host__ __device__ int Dim() { return 9 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } // has cpu_ptr(2) - __host__ __device__ int DimSize(size_t dim, int embedx_dim) { return sizeof(float); } - __host__ __device__ int Size() { return Dim() * sizeof(float); } // cpu_ptr:uint64=2float - __host__ __device__ int EmbedDim() { return embed_sgd_dim;} - __host__ __device__ int EmbedXDim() { return embedx_sgd_dim;} - __host__ __device__ int EmbedWDim() { return embedx_dim;} - __host__ __device__ int CpuPtrIndex() {return 0; } // cpuprt uint64 - __host__ __device__ int DeltaScoreIndex() { return CpuPtrIndex() + 2; } + __host__ __device__ int Dim() { + return 9 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; + } // has cpu_ptr(2) + __host__ __device__ int DimSize(size_t dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ size_t Size() { + return TYPEALIGN(8, Dim() * sizeof(float)); + } // cpu_ptr:uint64=2float + __host__ __device__ int EmbedDim() { return embed_sgd_dim; } + __host__ __device__ int EmbedXDim() { return embedx_sgd_dim; } + __host__ __device__ int EmbedWDim() { return embedx_dim; } + __host__ __device__ int CpuPtrIndex() { return 0; } // cpuprt uint64 + __host__ __device__ int DeltaScoreIndex() { return CpuPtrIndex() + 2; } __host__ __device__ int ShowIndex() { return DeltaScoreIndex() + 1; } __host__ __device__ int ClickIndex() { return ShowIndex() + 1; } __host__ __device__ int EmbedWIndex() { return ClickIndex() + 1; } __host__ __device__ int EmbedG2SumIndex() { return EmbedWIndex() + 1; } - __host__ __device__ int SlotIndex() { return EmbedG2SumIndex() + embed_sgd_dim; } + __host__ __device__ int SlotIndex() { + return EmbedG2SumIndex() + embed_sgd_dim; + } __host__ __device__ int MfDimIndex() { return SlotIndex() + 1; } - __host__ __device__ int MfSizeIndex() { return MfDimIndex() + 1; } // actual mf size (ex. 0) + __host__ __device__ int MfSizeIndex() { + return MfDimIndex() + 1; + } // actual mf size (ex. 
0) __host__ __device__ int EmbedxG2SumIndex() { return MfSizeIndex() + 1; } - __host__ __device__ int EmbedxWIndex() { return EmbedxG2SumIndex() + embedx_sgd_dim; } - + __host__ __device__ int EmbedxWIndex() { + return EmbedxG2SumIndex() + embedx_sgd_dim; + } // 根据mf_dim计算的总长度 __host__ __device__ int Dim(int& mf_dim) { int tmp_embedx_sgd_dim = 1; - if (optimizer_type_ == 3) {//adam + if (optimizer_type_ == 3) { // adam tmp_embedx_sgd_dim = mf_dim * 2 + 2; - } else if (optimizer_type_ == 4) { //shared_adam + } else if (optimizer_type_ == 4) { // shared_adam tmp_embedx_sgd_dim = 4; } return 9 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim; } // 根据mf_dim 计算的总byte数 - __host__ __device__ int Size(int& mf_dim) { - return Dim(mf_dim) * sizeof(float); // cpu_ptr:2float + __host__ __device__ size_t Size(int& mf_dim) { + return TYPEALIGN(8, Dim(mf_dim) * sizeof(float)); // cpu_ptr:2float } // 根据mf_dim 计算的 mf_size byte数 - __host__ __device__ int MFSize(int& mf_dim) { + __host__ __device__ size_t MFSize(int& mf_dim) { int tmp_embedx_sgd_dim = 1; - if (optimizer_type_ == 3) { //adam + if (optimizer_type_ == 3) { // adam tmp_embedx_sgd_dim = mf_dim * 2 + 2; - } else if (optimizer_type_ == 4) { //shared_adam + } else if (optimizer_type_ == 4) { // shared_adam tmp_embedx_sgd_dim = 4; } return (tmp_embedx_sgd_dim + mf_dim) * sizeof(float); @@ -111,33 +132,42 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { __host__ __device__ int EmbedxG2SumOffsetIndex() { return 0; } __host__ __device__ int EmbedxWOffsetIndex(float* val) { - // has mf + // has mf int tmp_embedx_sgd_dim = 1; if (int(MfSize(val)) > 0) { - if (optimizer_type_ == 3) {//adam + if (optimizer_type_ == 3) { // adam tmp_embedx_sgd_dim = int(MfDim(val)) * 2 + 2; - } else if (optimizer_type_ == 4) { //shared_adam + } else if (optimizer_type_ == 4) { // shared_adam tmp_embedx_sgd_dim = 4; } - return EmbedxG2SumIndex() + tmp_embedx_sgd_dim; + return EmbedxG2SumIndex() + tmp_embedx_sgd_dim; } else { // no mf return 0; } } - - __host__ __device__ uint64_t CpuPtr(float* val) {return *(reinterpret_cast(val)); } - __host__ __device__ float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; } + __host__ __device__ uint64_t CpuPtr(float* val) { + return *(reinterpret_cast(val)); + } + __host__ __device__ float& DeltaScore(float* val) { + return val[DeltaScoreIndex()]; + } __host__ __device__ float& Show(float* val) { return val[ShowIndex()]; } __host__ __device__ float& Click(float* val) { return val[ClickIndex()]; } __host__ __device__ float& Slot(float* val) { return val[SlotIndex()]; } __host__ __device__ float& MfDim(float* val) { return val[MfDimIndex()]; } __host__ __device__ float& MfSize(float* val) { return val[MfSizeIndex()]; } __host__ __device__ float& EmbedW(float* val) { return val[EmbedWIndex()]; } - __host__ __device__ float& EmbedG2Sum(float* val) { return val[EmbedG2SumIndex()]; } - __host__ __device__ float& EmbedxG2Sum(float* val) { return val[EmbedxG2SumIndex()]; } - __host__ __device__ float& EmbedxW(float* val) { return val[EmbedxWIndex()]; } + __host__ __device__ float& EmbedG2Sum(float* val) { + return val[EmbedG2SumIndex()]; + } + __host__ __device__ float& EmbedxG2Sum(float* val) { + return val[EmbedxG2SumIndex()]; + } + __host__ __device__ float& EmbedxW(float* val) { + return val[EmbedxWIndex()]; + } int embed_sgd_dim; int embedx_dim; @@ -146,21 +176,27 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { }; struct CommonPullValue { - /* - float show; - float click; - float embed_w; - float 
mf_size - std::vector embedx_w; - */ - __host__ __device__ int ShowIndex() { return 0; } - __host__ __device__ int ClickIndex() { return 1; } - __host__ __device__ int EmbedWIndex() { return 2; } - __host__ __device__ int MfSizeIndex() { return 3; } // actual mf size (ex. 0) - __host__ __device__ int EmbedxWIndex() { return 4; } - __host__ __device__ int Size(const int mf_dim) { - return (4 + mf_dim) * sizeof(float); - } + /* + float show; + float click; + float embed_w; + float mf_size + std::vector embedx_w; + */ + __host__ __device__ static int Dim(int embedx_dim) { + return 4 + embedx_dim; + } + __host__ __device__ int DimSize(size_t dim) { return sizeof(float); } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } + __host__ __device__ int ShowIndex() { return 0; } + __host__ __device__ int ClickIndex() { return 1; } + __host__ __device__ int EmbedWIndex() { return 2; } + __host__ __device__ int MfSizeIndex() { + return 3; + } // actual mf size (ex. 0) + __host__ __device__ int EmbedxWIndex() { return 4; } }; struct CommonPushValue { @@ -175,14 +211,28 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { __host__ __device__ int Dim(int embedx_dim) { return 5 + embedx_dim; } - __host__ __device__ int DimSize(int dim, int embedx_dim) { return sizeof(float); } - __host__ __device__ int Size(int embedx_dim) { return Dim(embedx_dim) * sizeof(float); } + __host__ __device__ int DimSize(int dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } __host__ __device__ int SlotIndex() { return 0; } - __host__ __device__ int ShowIndex() { return CommonPushValue::SlotIndex() + 1; } - __host__ __device__ int ClickIndex() { return CommonPushValue::ShowIndex() + 1; } - __host__ __device__ int MfDimIndex() { return CommonPushValue::ClickIndex() + 1; } - __host__ __device__ int EmbedGIndex() { return CommonPushValue::MfDimIndex() + 1; } - __host__ __device__ int EmbedxGIndex() { return CommonPushValue::EmbedGIndex() + 1; } + __host__ __device__ int ShowIndex() { + return CommonPushValue::SlotIndex() + 1; + } + __host__ __device__ int ClickIndex() { + return CommonPushValue::ShowIndex() + 1; + } + __host__ __device__ int MfDimIndex() { + return CommonPushValue::ClickIndex() + 1; + } + __host__ __device__ int EmbedGIndex() { + return CommonPushValue::MfDimIndex() + 1; + } + __host__ __device__ int EmbedxGIndex() { + return CommonPushValue::EmbedGIndex() + 1; + } __host__ __device__ float& Slot(float* val) { return val[CommonPushValue::SlotIndex()]; } @@ -203,21 +253,20 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { } }; - __host__ __device__ CommonFeatureValueAccessor() {} __host__ __device__ ~CommonFeatureValueAccessor() {} __host__ __device__ virtual int Initialize() { int optimizer_type = (_config.find("optimizer_type") == _config.end()) - ? 1 - : int(_config["optimizer_type"]); + ? 1 + : int(_config["optimizer_type"]); int sparse_embedx_dim = (_config.find("embedx_dim") == _config.end()) ? 
8 : int(_config["embedx_dim"]); - if (optimizer_type == 3) { //adam + if (optimizer_type == 3) { // adam common_feature_value.embed_sgd_dim = 4; common_feature_value.embedx_sgd_dim = sparse_embedx_dim * 2 + 2; - } else if (optimizer_type == 4) { //shared_adam + } else if (optimizer_type == 4) { // shared_adam common_feature_value.embed_sgd_dim = 4; common_feature_value.embedx_sgd_dim = 4; } else { @@ -230,7 +279,252 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { return 0; } - __host__ __device__ std::string ParseToString(const float* v, int param_size) { + // // build阶段从cpu_val赋值给gpu_val + __host__ void BuildFill( + float* gpu_val, void* cpu, + paddle::distributed::ValueAccessor* cpu_table_accessor, int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); + paddle::distributed::FixedFeatureValue* cpu_ptr = + (paddle::distributed::FixedFeatureValue*)(cpu); + float* cpu_val = cpu_ptr->data(); + size_t cpu_dim = cpu_ptr->size(); + + gpu_val[common_feature_value.DeltaScoreIndex()] = + cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()]; + gpu_val[common_feature_value.ShowIndex()] = + cpu_val[cpu_accessor->common_feature_value.ShowIndex()]; + gpu_val[common_feature_value.ClickIndex()] = + cpu_val[cpu_accessor->common_feature_value.ClickIndex()]; + gpu_val[common_feature_value.SlotIndex()] = + cpu_val[cpu_accessor->common_feature_value.SlotIndex()]; + gpu_val[common_feature_value.EmbedWIndex()] = + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + gpu_val[common_feature_value.EmbedG2SumIndex() + i] = + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i]; + } + *(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex())) = (uint64_t)(cpu); + cpu_val[cpu_accessor->common_feature_value.MfDimIndex()] = float(mf_dim); + gpu_val[common_feature_value.MfDimIndex()] = mf_dim; + if (cpu_dim > cpu_accessor->GetAccessorInfo().dim - + cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)) { + gpu_val[common_feature_value.MfSizeIndex()] = + common_feature_value.MFSize(mf_dim) / sizeof(float); + + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) { + gpu_val[common_feature_value.EmbedxG2SumIndex() + x] = + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x]; + } + } else { + gpu_val[common_feature_value.MfSizeIndex()] = 0; + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); x++) { + gpu_val[x] = 0; + } + } +#endif + } + + // dump_to_cpu阶段从gpu_val赋值给cpu_val + __host__ void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); + + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(*(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex()))); + size_t downpour_value_size = downpour_value->size(); + if (gpu_val[common_feature_value.MfSizeIndex()] > 0 && + downpour_value_size == (cpu_accessor->GetAccessorInfo().dim - + int(cpu_accessor->GetAccessorInfo().mf_size / + sizeof(float)))) { // cpu_accessor + downpour_value->resize(cpu_accessor->common_feature_value.Dim(mf_dim)); + } + float* cpu_val = downpour_value->data(); + cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()] = + 
gpu_val[common_feature_value.DeltaScoreIndex()]; + cpu_val[cpu_accessor->common_feature_value.ShowIndex()] = + gpu_val[common_feature_value.ShowIndex()]; + cpu_val[cpu_accessor->common_feature_value.ClickIndex()] = + gpu_val[common_feature_value.ClickIndex()]; + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()] = + gpu_val[common_feature_value.EmbedWIndex()]; + cpu_val[cpu_accessor->common_feature_value.SlotIndex()] = + gpu_val[common_feature_value.SlotIndex()]; + + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i] = + gpu_val[common_feature_value.EmbedG2SumIndex() + i]; + } + + if (gpu_val[common_feature_value.MfSizeIndex()] > 0) { + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); x++) { + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x] = + gpu_val[common_feature_value.EmbedxG2SumIndex() + x]; + } + } +#endif + } + + // dy_mf_fill_dvals_kernel 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void FeatureValueFill(float* dest_val, float* src_val, + int mf_dim) { + *(reinterpret_cast(dest_val + + common_feature_value.CpuPtrIndex())) = + *(reinterpret_cast(src_val + + common_feature_value.CpuPtrIndex())); + dest_val[common_feature_value.DeltaScoreIndex()] = + src_val[common_feature_value.DeltaScoreIndex()]; + dest_val[common_feature_value.ShowIndex()] = + src_val[common_feature_value.ShowIndex()]; + dest_val[common_feature_value.ClickIndex()] = + src_val[common_feature_value.ClickIndex()]; + dest_val[common_feature_value.EmbedWIndex()] = + src_val[common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + dest_val[common_feature_value.EmbedG2SumIndex() + i] = + src_val[common_feature_value.EmbedG2SumIndex() + i]; + } + dest_val[common_feature_value.SlotIndex()] = + src_val[common_feature_value.SlotIndex()]; + dest_val[common_feature_value.MfDimIndex()] = mf_dim; + dest_val[common_feature_value.MfSizeIndex()] = + src_val[common_feature_value.MfSizeIndex()]; + + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); x++) { + dest_val[x] = src_val[x]; + } + } + + // dy_mf_fill_dvals_kernel, dy_mf_search_kernel 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void PullValueFill(float* dest_val, float* src_val) { + dest_val[common_pull_value.ShowIndex()] = + src_val[common_feature_value.ShowIndex()]; + dest_val[common_pull_value.ClickIndex()] = + src_val[common_feature_value.ClickIndex()]; + dest_val[common_pull_value.EmbedWIndex()] = + src_val[common_feature_value.EmbedWIndex()]; + + int mf_size = int(src_val[common_feature_value.MfSizeIndex()]); + if (mf_size == 0) { + dest_val[common_pull_value.MfSizeIndex()] = 0; + return; + } + // set pull value real dim size + int mf_dim = int(src_val[common_feature_value.MfDimIndex()]); + dest_val[common_pull_value.MfSizeIndex()] = mf_dim; + + int embedx_off = common_pull_value.EmbedxWIndex(); + int value_off = common_feature_value.EmbedxWIndex(); + for (int k = 0; k < mf_dim; ++k) { + dest_val[embedx_off + k] = src_val[value_off + k]; + } + } + + // dy_mf_fill_shard_grads_kernel,update_one 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void PushValueFill(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + 
dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + + for (int x = 0; x < int(src_val[common_push_value.MfDimIndex()]); x++) { + dest_val[common_push_value.EmbedxGIndex() + x] = + src_val[common_push_value.EmbedxGIndex() + x]; + } + } + + // update_basic 阶段 gpukernel 中从src_val赋值给dest_val + __host__ __device__ void PushValueFillBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + } + + // merge_one 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValue(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; + for (int j = 0; j < int(dest_val[common_push_value.MfDimIndex()]); j++) { + dest_val[common_push_value.EmbedxGIndex() + j] += + src_val[common_push_value.EmbedxGIndex() + j]; + } + } + + // merge_basic 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValueBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; + } + + // PullCopy 阶段 gpukernel 中 FeatureValue回填到PullValue + __host__ __device__ void Select(float* dest_val, float* src_val, + uint64_t* key, int mf_dim) { + if (*key == 0) { + *(dest_val + common_pull_value.ShowIndex()) = 0; + *(dest_val + common_pull_value.ClickIndex()) = 0; + *(dest_val + common_pull_value.EmbedWIndex()) = 0; + } else { + *(dest_val + common_pull_value.ShowIndex()) = + src_val[common_pull_value.ShowIndex()]; + *(dest_val + common_pull_value.ClickIndex()) = + src_val[common_pull_value.ClickIndex()]; + *(dest_val + common_pull_value.EmbedWIndex()) = + src_val[common_pull_value.EmbedWIndex()]; + } + + if (src_val[common_pull_value.MfSizeIndex()] == 0 || *key == 0) { + for (int j = 0; j < mf_dim; j++) { + *(dest_val + common_pull_value.EmbedxWIndex() + j) = 0; + } + } else { + for (int j = 0; j < mf_dim; j++) { + // common_pull_value EmbedxWIndex 之前还有 MfSizeIndex, + // 所以这里没有直接使用 common_pull_value.EmbedxWIndex() + *(dest_val + 3 + j) = src_val[common_pull_value.EmbedxWIndex() + j]; + } + } + } + + __host__ __device__ std::string ParseToString(const float* v, + int param_size) { /* uint64_t cpu_ptr; // 2float float delta_score; @@ -245,21 +539,21 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { std::vector embedx_w; */ std::stringstream os; - os << "cpuptr: " << common_feature_value.CpuPtr(const_cast(v)) << " delta_score: " << v[2] - << " show: " << v[3] << " click: " << v[4] - << " embed_w:" << v[5] << " embed_g2sum:"; + os << "cpuptr: " 
<< common_feature_value.CpuPtr(const_cast(v)) + << " delta_score: " << v[2] << " show: " << v[3] << " click: " << v[4] + << " embed_w:" << v[5] << " embed_g2sum:"; for (int i = common_feature_value.EmbedG2SumIndex(); - i < common_feature_value.SlotIndex(); i++) { + i < common_feature_value.SlotIndex(); i++) { os << " " << v[i]; } int mf_dim = int(common_feature_value.MfDim(const_cast(v))); - os << " slot: " << common_feature_value.Slot(const_cast(v)) - << " mf_dim: " << mf_dim - << " mf_size: " << common_feature_value.MfSize(const_cast(v)) - << " mf: "; + os << " slot: " << common_feature_value.Slot(const_cast(v)) + << " mf_dim: " << mf_dim + << " mf_size: " << common_feature_value.MfSize(const_cast(v)) + << " mf: "; if (param_size > common_feature_value.EmbedxG2SumIndex()) { for (auto i = common_feature_value.EmbedxG2SumIndex(); - i < common_feature_value.Dim(mf_dim); ++i) { + i < common_feature_value.Dim(mf_dim); ++i) { os << " " << v[i]; } } @@ -272,7 +566,6 @@ class CommonFeatureValueAccessor : public FeatureValueAccessor { CommonPullValue common_pull_value; }; - struct FeatureValue { float delta_score; float show; @@ -344,6 +637,253 @@ struct FeaturePushValue { } }; +class VirtualAccessor { + public: + virtual int Configure(std::unordered_map config) = 0; + + virtual size_t GetFeatureValueSize(int& mf_dim) = 0; + + virtual size_t GetPushValueSize(int& mf_dim) = 0; + + virtual size_t GetPullValueSize(int& mf_dim) = 0; + + virtual void BuildFill(void* gpu_val, void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; + + virtual void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, const int64_t total_length, + int* gpu_dim, int feature_value_size) = 0; + // dedup + virtual void CopyForPull(const paddle::platform::Place& place, + const uint64_t* total_keys, float** gpu_values, + const float* total_values_gpu, + const int64_t* slot_lens, const int* key2slot, + const int hidden_size, const int64_t total_length, + const int* slot_dims, + const uint32_t* gpu_restore_idx, + int pull_value_size) = 0; + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) = 0; + + // dedup + virtual void CopyForPush(const paddle::platform::Place& place, + const uint64_t* total_keys, float** grad_values, + float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, + const int64_t dedup_length, const int batch_size, + const int* slot_dims, const int* key2slot, + const uint32_t* d_restore_idx, + const size_t grad_value_size) = 0; + + virtual void CopyForPush( + const paddle::platform::Place& place, const uint64_t* total_keys, + float** grad_values, float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, const int64_t dedup_length, + const int batch_size, const int* slot_dims, const int* key2slot, + const uint32_t* gpu_sort_idx, const uint32_t* gpu_sort_offset, + const uint32_t* gpu_sort_lens, const size_t 
grad_value_size) = 0; + + virtual std::string ParseToString(const float* v, int param_size) = 0; +}; + +template +class AccessorWrapper : public VirtualAccessor { + public: + explicit AccessorWrapper() {} + virtual ~AccessorWrapper() {} + AccessorWrapper(const AccessorWrapper&) = delete; + AccessorWrapper& operator=(const AccessorWrapper&) = delete; + + virtual int Configure(std::unordered_map config) { + return gpu_accessor_.Configure(config); + } + + virtual size_t GetFeatureValueSize(int& mf_dim) { + return gpu_accessor_.common_feature_value.Size(mf_dim); + } + + virtual size_t GetPushValueSize(int& mf_dim) { + return gpu_accessor_.common_push_value.Size(mf_dim); + } + + virtual size_t GetPullValueSize(int& mf_dim) { + return gpu_accessor_.common_pull_value.Size(mf_dim); + } + + virtual void BuildFill(void* gpu_val, void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { + gpu_accessor_.BuildFill((float*)(gpu_val), cpu_val, cpu_table_accessor, + mf_dim); + } + + virtual void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { + gpu_accessor_.DumpFill(gpu_val, cpu_table_accessor, mf_dim); + } + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, const int slot_num, + const int hidden_size, const int64_t total_length, + int* gpu_dim, int feature_value_size) { + CopyForPullImpl(place, gpu_keys, values, total_values_gpu, gpu_len, + slot_num, hidden_size, total_length, gpu_dim, + feature_value_size); + } + + virtual void CopyForPull(const paddle::platform::Place& place, + const uint64_t* total_keys, float** gpu_values, + const float* total_values_gpu, + const int64_t* slot_lens, const int* key2slot, + const int hidden_size, const int64_t total_length, + const int* slot_dims, + const uint32_t* gpu_restore_idx, + int pull_value_size) { + CopyForPullDedupImpl(place, total_keys, gpu_values, total_values_gpu, + slot_lens, key2slot, hidden_size, total_length, + slot_dims, gpu_restore_idx, pull_value_size); + } + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + CopyForPushImpl(place, grad_values, total_grad_values_gpu, slot_lengths, + total_length, batch_size, grad_value_size, slot_vector, + slot_mf_dim_vector); + } + + virtual void CopyForPush(const paddle::platform::Place& place, + const uint64_t* total_keys, float** grad_values, + float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, + const int64_t dedup_length, const int batch_size, + const int* slot_dims, const int* key2slot, + const uint32_t* d_restore_idx, + const size_t grad_value_size) { + CopyForPushDedupImpl(place, total_keys, grad_values, total_grad_values_gpu, + slots, slot_lens, hidden_size, total_length, + dedup_length, batch_size, slot_dims, key2slot, + d_restore_idx, grad_value_size); + } + + virtual void CopyForPush( + const paddle::platform::Place& place, const uint64_t* total_keys, + float** grad_values, float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, const int64_t dedup_length, + const int batch_size, const int* 
slot_dims, const int* key2slot, + const uint32_t* gpu_sort_idx, const uint32_t* gpu_sort_offset, + const uint32_t* gpu_sort_lens, const size_t grad_value_size) { + CopyForPushDedupImpl(place, total_keys, grad_values, total_grad_values_gpu, + slots, slot_lens, hidden_size, total_length, + dedup_length, batch_size, slot_dims, key2slot, + gpu_sort_idx, gpu_sort_offset, gpu_sort_lens, + grad_value_size); + } + + void CopyForPullImpl(const paddle::platform::Place& place, + uint64_t** gpu_keys, const std::vector& values, + const float* total_values_gpu, const int64_t* gpu_len, + const int slot_num, const int hidden_size, + const int64_t total_length, int* gpu_dim, + int feature_value_size); + + void CopyForPushImpl(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, const int batch_size, + size_t grad_value_size, std::vector& slot_vector, + std::vector& slot_mf_dim_vector); + + void CopyForPullDedupImpl(const paddle::platform::Place& place, + const uint64_t* total_keys, float** gpu_values, + const float* total_values_gpu, + const int64_t* slot_lens, const int* key2slot, + const int hidden_size, const int64_t total_length, + const int* slot_dims, + const uint32_t* gpu_restore_idx, + int pull_value_size); + + void CopyForPushDedupImpl(const paddle::platform::Place& place, + const uint64_t* total_keys, float** grad_values, + float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, + const int64_t dedup_length, const int batch_size, + const int* slot_dims, const int* key2slot, + const uint32_t* d_restore_idx, + const size_t grad_value_size); + + void CopyForPushDedupImpl( + const paddle::platform::Place& place, const uint64_t* total_keys, + float** grad_values, float* total_grad_values_gpu, const int* slots, + const int64_t* slot_lens, const int hidden_size, + const int64_t total_length, const int64_t dedup_length, + const int batch_size, const int* slot_dims, const int* key2slot, + const uint32_t* gpu_sort_idx, const uint32_t* gpu_sort_offset, + const uint32_t* gpu_sort_lens, const size_t grad_value_size); + + virtual std::string ParseToString(const float* v, int param_size) { + return gpu_accessor_.ParseToString(v, param_size); + } + + GPUAccessor gpu_accessor_; +}; + +class GlobalAccessorTransfor { + public: + static GlobalAccessorTransfor& GetInstance() { + static GlobalAccessorTransfor ins; + return ins; + } + void Init(std::string accessor_type) { + if (accessor_wrapper_ptr_ != nullptr) { + return; + } + if (accessor_type == "CtrDymfAccessor") { + accessor_wrapper_ptr_ = new AccessorWrapper(); + } else { + VLOG(0) << "GlobalAccessorTransfor Init not support accessor_type:" + << accessor_type; + accessor_wrapper_ptr_ = new AccessorWrapper(); + } + } + VirtualAccessor* GetAccessorWrapper() { return accessor_wrapper_ptr_; } + + private: + VirtualAccessor* accessor_wrapper_ptr_ = nullptr; +}; + } // end namespace framework } // end namespace paddle + #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 11a52d631729c..a0acad9563ef0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -27,7 +27,8 @@ DECLARE_double(gpugraph_hbm_table_load_factor); namespace paddle { namespace framework { enum GraphTableType { EDGE_TABLE, FEATURE_TABLE }; 
-class GpuPsGraphTable : public HeterComm { +class GpuPsGraphTable + : public HeterComm { public: int get_table_offset(int gpu_id, GraphTableType type, int idx) const { int type_id = type; @@ -36,7 +37,8 @@ class GpuPsGraphTable : public HeterComm { } GpuPsGraphTable(std::shared_ptr resource, int topo_aware, int graph_table_num) - : HeterComm(1, resource) { + : HeterComm( + 1, resource) { load_factor_ = FLAGS_gpugraph_hbm_table_load_factor; VLOG(0) << "load_factor = " << load_factor_; @@ -108,8 +110,7 @@ class GpuPsGraphTable : public HeterComm { } } } - ~GpuPsGraphTable() { - } + ~GpuPsGraphTable() {} void build_graph_on_single_gpu(const GpuPsCommGraph &g, int gpu_id, int idx); void build_graph_fea_on_single_gpu(const GpuPsCommGraphFea &g, int gpu_id); void clear_graph_info(int gpu_id, int index); @@ -118,8 +119,8 @@ class GpuPsGraphTable : public HeterComm { void clear_feature_info(int index); void build_graph_from_cpu(const std::vector &cpu_node_list, int idx); - void build_graph_fea_from_cpu(const std::vector &cpu_node_list, - int idx); + void build_graph_fea_from_cpu( + const std::vector &cpu_node_list, int idx); NodeQueryResult graph_node_sample(int gpu_id, int sample_size); NeighborSampleResult graph_neighbor_sample_v3(NeighborSampleQuery q, bool cpu_switch); @@ -129,17 +130,16 @@ class GpuPsGraphTable : public HeterComm { uint64_t *key, int sample_size, int len, bool cpu_query_switch); - int get_feature_of_nodes(int gpu_id, uint64_t* d_walk, - uint64_t* d_offset, int size, int slot_num); + int get_feature_of_nodes(int gpu_id, uint64_t *d_walk, uint64_t *d_offset, + int size, int slot_num); NodeQueryResult query_node_list(int gpu_id, int idx, int start, int query_size); void display_sample_res(void *key, void *val, int len, int sample_len); - void move_result_to_source_gpu(int gpu_id, int gpu_num, - int sample_size, int *h_left, - int *h_right, - uint64_t *src_sample_res, - int *actual_sample_size); + void move_result_to_source_gpu(int gpu_id, int gpu_num, int sample_size, + int *h_left, int *h_right, + uint64_t *src_sample_res, + int *actual_sample_size); int init_cpu_table(const paddle::distributed::GraphParameter &graph); int gpu_num; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index d63060cc5e391..2e4fd943b728e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -126,8 +126,9 @@ class HashTable { void get(const KeyType* d_keys, ValType* d_vals, size_t len, StreamType stream); - template - void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream); + template + void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream, + FVAccessor& fv_accessor); void show(); @@ -140,8 +141,8 @@ class HashTable { #if defined(PADDLE_WITH_CUDA) template - void update(const KeyType* d_keys, const float* d_grads, size_t len, - Sgd sgd, StreamType stream); + void update(const KeyType* d_keys, const float* d_grads, size_t len, Sgd sgd, + StreamType stream); template void update(const KeyType* d_keys, const char* d_grads, size_t len, Sgd sgd, @@ -168,14 +169,10 @@ class HashTable { << " push value size: " << push_grad_value_size_; } - void set_accessor(CommonFeatureValueAccessor& accessor) { - feature_value_accessor_ = accessor; - } void show_collision(int id) { return container_->print_collision(id); } std::unique_ptr rwlock_{nullptr}; - CommonFeatureValueAccessor feature_value_accessor_; private: #if defined(PADDLE_WITH_CUDA) 
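The hashtable.h and feature_value.h changes above remove the per-table CommonFeatureValueAccessor member and instead route all value-size and fill logic through the new VirtualAccessor / AccessorWrapper interface obtained from GlobalAccessorTransfor. The following is an illustrative host-side sketch (not part of this diff) of how that wrapper is created and queried; the function name AccessorUsageSketch and the config values are hypothetical, while the class names, config keys, and member functions come from the feature_value.h hunks above.

// Illustrative only: obtain the global accessor wrapper and query the
// per-feature byte sizes used to size HBM pools and pull/push buffers.
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"

void AccessorUsageSketch() {
  using paddle::framework::GlobalAccessorTransfor;
  using paddle::framework::VirtualAccessor;

  // Select the GPU-side accessor implementation once; later calls reuse it.
  GlobalAccessorTransfor::GetInstance().Init("CtrDymfAccessor");
  VirtualAccessor* accessor =
      GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();

  // Keys mirror the ones read in CommonFeatureValueAccessor::Initialize().
  std::unordered_map<std::string, float> config;
  config["embedx_dim"] = 8;      // hypothetical embedding dim
  config["optimizer_type"] = 1;  // default (non-adam) branch in Initialize()
  accessor->Configure(config);

  int mf_dim = 8;  // hypothetical slot mf dim
  size_t value_bytes = accessor->GetFeatureValueSize(mf_dim);  // HBM value size
  size_t pull_bytes = accessor->GetPullValueSize(mf_dim);      // pull buffer size
  size_t push_bytes = accessor->GetPushValueSize(mf_dim);      // push grad size
  (void)value_bytes;
  (void)pull_bytes;
  (void)push_bytes;
}

This is the same lookup HeterComm now performs in its constructor (see the heter_comm_inl.h hunk below), which is why the explicit set_accessor plumbing on HashTable could be dropped.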
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 3d083cf996553..2586d716ada48 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -80,12 +80,12 @@ __global__ void search_kernel(Table* table, } } -template +template __global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, char* vals, size_t len, size_t pull_feature_value_size, - CommonFeatureValueAccessor feature_value_accessor) { + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; // return; if (i < len) { @@ -94,28 +94,7 @@ __global__ void dy_mf_search_kernel(Table* table, uint64_t offset = i * pull_feature_value_size; float* cur = (float*)(vals + offset); float* input = it->second; - - cur[feature_value_accessor.common_pull_value.ShowIndex()] = - input[feature_value_accessor.common_feature_value.ShowIndex()]; - cur[feature_value_accessor.common_pull_value.ClickIndex()] = - input[feature_value_accessor.common_feature_value.ClickIndex()]; - cur[feature_value_accessor.common_pull_value.EmbedWIndex()] = - input[feature_value_accessor.common_feature_value.EmbedWIndex()]; - - int mf_size = int(input[feature_value_accessor.common_feature_value.MfSizeIndex()]); - if (mf_size == 0) { - cur[feature_value_accessor.common_pull_value.MfSizeIndex()] = 0; - return; - } - // set pull value real dim size - int mf_dim = int(input[feature_value_accessor.common_feature_value.MfDimIndex()]); - cur[feature_value_accessor.common_pull_value.MfSizeIndex()] = mf_dim; - - int embedx_off = feature_value_accessor.common_pull_value.EmbedxWIndex(); - int value_off = feature_value_accessor.common_feature_value.EmbedxWIndex(); - for (int k = 0; k < mf_dim; ++k) { - cur[embedx_off + k] = input[value_off + k]; - } + feature_value_accessor.PullValueFill(cur, input); } } } @@ -201,15 +180,16 @@ void HashTable::get(const KeyType* d_keys, ValType* d_vals, } template -template +template void HashTable::get(const KeyType* d_keys, char* d_vals, - size_t len, StreamType stream) { + size_t len, StreamType stream, + FVAccessor& fv_accessor) { if (len == 0) { return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; dy_mf_search_kernel<<>>( - container_, d_keys, d_vals, len, pull_feature_value_size_, feature_value_accessor_); + container_, d_keys, d_vals, len, pull_feature_value_size_, fv_accessor); } template @@ -337,14 +317,14 @@ template class HashTable; template class HashTable; template class HashTable; -template void HashTable::get< - cudaStream_t>(const unsigned long* d_keys, - float* d_vals, size_t len, - cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, float* d_vals, size_t len, + cudaStream_t stream); template void -HashTable::get( - const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream); +HashTable::get( + const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream, + CommonFeatureValueAccessor& fv_accessor); template void HashTable::get(const long* d_keys, int* d_vals, size_t len, @@ -353,7 +333,8 @@ template void HashTable::get(const long* d_keys, template void HashTable::get( const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream); template void HashTable::get( - const unsigned long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); + const unsigned long* d_keys, unsigned long* d_vals, size_t len, + cudaStream_t 
stream); template void HashTable::get( const unsigned long* d_keys, long* d_vals, size_t len, cudaStream_t stream); template void HashTable::get( @@ -368,15 +349,13 @@ template void HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t // stream); -template void HashTable::insert< - cudaStream_t>(const unsigned long* d_keys, - const float* d_vals, size_t len, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, const float* d_vals, size_t len, + cudaStream_t stream); -template void HashTable:: - insert(const unsigned long* d_keys, size_t len, char* pool, - size_t feature_value_size, size_t start_index, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, size_t len, char* pool, + size_t feature_value_size, size_t start_index, cudaStream_t stream); template void HashTable::insert(const long* d_keys, const int* d_vals, @@ -402,26 +381,27 @@ template void HashTable::insert( template void HashTable::insert( const long* d_keys, const unsigned int* d_vals, size_t len, cudaStream_t stream); - + template void HashTable::insert( const unsigned long* d_keys, const unsigned long* d_vals, size_t len, - cudaStream_t stream); + cudaStream_t stream); -template void HashTable:: - dump_to_cpu(int devid, cudaStream_t stream); +template void HashTable::dump_to_cpu( + int devid, cudaStream_t stream); template void -HashTable::update(const unsigned long* d_keys, const char* d_grads, size_t len, - SparseAdagradOptimizer sgd, - cudaStream_t stream); -template void -HashTable::update(const unsigned long* d_keys, const char* d_grads, size_t len, - SparseAdamOptimizer sgd, - cudaStream_t stream); +HashTable::update( + const unsigned long* d_keys, const char* d_grads, size_t len, + SparseAdagradOptimizer sgd, cudaStream_t stream); template void -HashTable::update(const unsigned long* d_keys, const char* d_grads, size_t len, - SparseAdamSharedOptimizer sgd, - cudaStream_t stream); +HashTable::update( + const unsigned long* d_keys, const char* d_grads, size_t len, + SparseAdamOptimizer sgd, cudaStream_t stream); +template void HashTable::update< + SparseAdamSharedOptimizer, cudaStream_t>(const unsigned long* d_keys, + const char* d_grads, size_t len, + SparseAdamSharedOptimizer sgd, + cudaStream_t stream); // template void HashTable::update< diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 72dfb476efe0a..0557b66d8655d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -45,23 +45,20 @@ namespace framework { #define TYPEALIGN(ALIGNVAL, LEN) \ (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) -template +template class HeterComm { public: HeterComm(size_t capacity, std::shared_ptr resource); - HeterComm(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor& accessor); virtual ~HeterComm(); HeterComm(const HeterComm&) = delete; HeterComm& operator=(const HeterComm&) = delete; void split_input_to_shard(KeyType* d_keys, int* d_idx_ptr, size_t len, - int* left, int* right, int gpu_num); + int* left, int* right, int gpu_num); void merge_keys(int gpu_num, const KeyType* d_keys, size_t len, - KeyType* d_sorted_keys, - KeyType* d_merged_keys, - uint32_t* d_restore_idx, - size_t & uniq_len); + KeyType* d_sorted_keys, KeyType* d_merged_keys, + uint32_t* d_restore_idx, size_t& uniq_len); void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, 
size_t len, int& uniq_len); // NOLINT void dynamic_merge_grad(int gpu_num, KeyType* d_keys, float* d_grads, @@ -126,7 +123,7 @@ class HeterComm { max_mf_dim_ = max_mf_dim; } - void set_accessor(CommonFeatureValueAccessor& accessor) { + void set_accessor(FVAccessor& accessor) { feature_value_accessor_ = accessor; } #endif @@ -142,16 +139,12 @@ class HeterComm { void end_pass(); #if defined(PADDLE_WITH_CUDA) // dedup - int dedup_keys_and_fillidx(const int gpu_id, - const int total_fea_num, - const KeyType* d_keys, // input - KeyType* d_merged_keys, // output - KeyType* d_sorted_keys, - uint32_t* d_restore_idx, - uint32_t* d_sorted_idx, - uint32_t* d_offset, - uint32_t* d_merged_cnts, - bool filter_zero); + int dedup_keys_and_fillidx(const int gpu_id, const int total_fea_num, + const KeyType* d_keys, // input + KeyType* d_merged_keys, // output + KeyType* d_sorted_keys, uint32_t* d_restore_idx, + uint32_t* d_sorted_idx, uint32_t* d_offset, + uint32_t* d_merged_cnts, bool filter_zero); #endif struct Node { @@ -259,11 +252,11 @@ class HeterComm { ValType* src_val); void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, size_t val_size); + protected: void pull_merge_sparse(int num, KeyType* d_keys, float* d_vals, size_t len); void pull_normal_sparse(int num, KeyType* d_keys, float* d_vals, size_t len); - protected: using Table = HashTable; using PtrTable = HashTable; std::vector tables_; @@ -274,7 +267,7 @@ class HeterComm { int block_size_{256}; std::unique_ptr heter_comm_kernel_; - CommonFeatureValueAccessor feature_value_accessor_; + FVAccessor feature_value_accessor_; private: int topo_aware_{0}; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index e97bcb561675b..3ce8a315af9fb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -31,8 +31,12 @@ DECLARE_int32(gpugraph_dedup_pull_push_mode); namespace paddle { namespace framework { -template -HeterComm::HeterComm( + +template +HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { VLOG(1) << "Construct new HeterComm"; resource_ = resource; @@ -45,52 +49,26 @@ HeterComm::HeterComm( platform::CUDADeviceGuard guard(resource_->dev_id(i)); allocators_.push_back(std::make_shared( 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT -#endif - if (!multi_mf_dim_) { - auto table = new Table(capacity / load_factor_); - tables_.push_back(table); - } else { - VLOG(0) << "Error:use HeterComm Construct with accessor"; - return; - } - if (multi_node_) { - storage_[i].init(feanum_, resource_->dev_id(i)); - } - } - heter_comm_kernel_ = std::make_unique(block_size_); - init_path(); -} - -template -HeterComm::HeterComm( - size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor& feature_value_accessor) { - VLOG(1) << "Construct new HeterComm"; - resource_ = resource; - storage_.resize(resource_->total_device()); - multi_mf_dim_ = resource->multi_mf(); - for (int i = 0; i < resource_->total_device(); ++i) { -#if defined(PADDLE_WITH_CUDA) - platform::CUDADeviceGuard guard(resource_->dev_id(i)); - allocators_.push_back(std::make_shared( - 8, 1, (unsigned int)-1, (size_t)-1, false, false)); // NOLINT #endif if (!multi_mf_dim_) { auto table = new Table(capacity / load_factor_); tables_.push_back(table); } else { max_mf_dim_ = resource_->max_mf_dim(); - feature_value_accessor_ = feature_value_accessor; - size_t val_type_size = TYPEALIGN( - 8, 
feature_value_accessor_.common_feature_value.Size(max_mf_dim_)); - size_t grad_type_size = TYPEALIGN( - 8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); - size_t pull_type_size = feature_value_accessor_.common_pull_value.Size(max_mf_dim_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = + accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + size_t grad_type_size = + accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + size_t pull_type_size = + accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); + VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size << ", feature_value_push_size:" << grad_type_size << ", feature_pull_type_size:" << pull_type_size; auto ptr_table = new PtrTable(capacity / load_factor_); - ptr_table->set_accessor(feature_value_accessor_); + // ptr_table->set_accessor(feature_value_accessor_); ptr_table->set_feature_value_size(pull_type_size, grad_type_size); ptr_tables_.push_back(ptr_table); } @@ -98,13 +76,15 @@ HeterComm::HeterComm( storage_[i].init(feanum_, resource_->dev_id(i)); } } - heter_comm_kernel_ = - std::make_unique(block_size_, feature_value_accessor_); + heter_comm_kernel_ = std::make_unique(block_size_); init_path(); } -template -void HeterComm::init_path() { +template +void HeterComm::init_path() { int total_device = resource_->total_device(); path_.resize(total_device); if (!topo_aware_) { @@ -156,11 +136,18 @@ void HeterComm::init_path() { } } -template +template template -void HeterComm::memory_copy( - DstPlace dst_place, void* dst, SrcPlace src_place, const void* src, - size_t count, StreamType stream) { +void HeterComm::memory_copy( + DstPlace dst_place, + void* dst, + SrcPlace src_place, + const void* src, + size_t count, + StreamType stream) { #if defined(PADDLE_WITH_CUDA) CUDA_CHECK(cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream)); if (stream == 0) { @@ -171,11 +158,12 @@ void HeterComm::memory_copy( #endif } -template -void HeterComm::create_storage(int start_index, - int end_index, - size_t keylen, - size_t vallen) { +template +void HeterComm::create_storage( + int start_index, int end_index, size_t keylen, size_t vallen) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; @@ -184,11 +172,13 @@ void HeterComm::create_storage(int start_index, PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].key_storage), // NOLINT - keylen, resource_->remote_stream(nodes[i].dev_num, start_index))); + keylen, + resource_->remote_stream(nodes[i].dev_num, start_index))); PADDLE_ENFORCE_GPU_SUCCESS(allocator->DeviceAllocate( resource_->dev_id(nodes[i].dev_num), (void**)&(nodes[i].val_storage), // NOLINT - vallen, resource_->remote_stream(nodes[i].dev_num, start_index))); + vallen, + resource_->remote_stream(nodes[i].dev_num, start_index))); nodes[i].key_bytes_len = keylen; nodes[i].val_bytes_len = vallen; } @@ -207,9 +197,12 @@ void HeterComm::create_storage(int start_index, #endif } -template -void HeterComm::destroy_storage(int start_index, - int end_index) { +template +void HeterComm::destroy_storage( + int start_index, int end_index) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; @@ -224,12 +217,17 @@ void HeterComm::destroy_storage(int start_index, #endif } -template -void HeterComm::walk_to_dest(int start_index, - int num, int* 
h_left, - int* h_right, - KeyType* src_key, - GradType* src_val) { +template +void HeterComm::walk_to_dest( + int start_index, + int num, + int* h_left, + int* h_right, + KeyType* src_key, + GradType* src_val) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -249,18 +247,24 @@ void HeterComm::walk_to_dest(int start_index, auto src_place = DevPlace(src_dev_id); auto dst_place = DevPlace(dst_dev_id); - memory_copy(dst_place, node.key_storage, src_place, + memory_copy(dst_place, + node.key_storage, + src_place, reinterpret_cast(src_key + h_left[i]), - node.key_bytes_len, node.in_stream); + node.key_bytes_len, + node.in_stream); // #if defined(PADDLE_WITH_CUDA) // adapt for gpu-graph // cudaMemsetAsync(node.val_storage, -1, node.val_bytes_len, // node.in_stream); // #endif if (need_copy_val) { - memory_copy(dst_place, node.val_storage, src_place, + memory_copy(dst_place, + node.val_storage, + src_place, reinterpret_cast(src_val + h_left[i]), - node.val_bytes_len, node.in_stream); + node.val_bytes_len, + node.in_stream); } } while (!que.empty()) { @@ -282,13 +286,17 @@ void HeterComm::walk_to_dest(int start_index, auto src_place = DevPlace(src_dev_id); auto dst_place = DevPlace(dst_dev_id); - memory_copy(dst_place, cur_task.path->nodes_[cur_step + 1].key_storage, - src_place, cur_task.path->nodes_[cur_step].key_storage, + memory_copy(dst_place, + cur_task.path->nodes_[cur_step + 1].key_storage, + src_place, + cur_task.path->nodes_[cur_step].key_storage, cur_task.path->nodes_[cur_step + 1].key_bytes_len, cur_task.path->nodes_[cur_step + 1].in_stream); if (need_copy_val) { - memory_copy(dst_place, cur_task.path->nodes_[cur_step + 1].val_storage, - src_place, cur_task.path->nodes_[cur_step].val_storage, + memory_copy(dst_place, + cur_task.path->nodes_[cur_step + 1].val_storage, + src_place, + cur_task.path->nodes_[cur_step].val_storage, cur_task.path->nodes_[cur_step + 1].val_bytes_len, cur_task.path->nodes_[cur_step + 1].in_stream); } @@ -296,10 +304,18 @@ void HeterComm::walk_to_dest(int start_index, } } -template -void HeterComm::walk_to_dest( - int start_index, int gpu_num, int* h_left, int* h_right, KeyType* src_key, - char* src_val, size_t val_size) { +template +void HeterComm::walk_to_dest( + int start_index, + int gpu_num, + int* h_left, + int* h_right, + KeyType* src_key, + char* src_val, + size_t val_size) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -313,13 +329,18 @@ void HeterComm::walk_to_dest( auto& node = path_[start_index][i].nodes_[0]; CopyTask t(&path_[start_index][i], 0); que.push(t); - CUDA_CHECK(cudaMemcpyAsync( - node.key_storage, reinterpret_cast(src_key + h_left[i]), - node.key_bytes_len, cudaMemcpyDefault, node.in_stream)); + CUDA_CHECK(cudaMemcpyAsync(node.key_storage, + reinterpret_cast(src_key + h_left[i]), + node.key_bytes_len, + cudaMemcpyDefault, + node.in_stream)); if (need_copy_val) { - CUDA_CHECK(cudaMemcpyAsync( - node.val_storage, src_val + uint64_t(h_left[i]) * uint64_t(val_size), - node.val_bytes_len, cudaMemcpyDefault, node.in_stream)); + CUDA_CHECK( + cudaMemcpyAsync(node.val_storage, + src_val + uint64_t(h_left[i]) * uint64_t(val_size), + node.val_bytes_len, + cudaMemcpyDefault, + node.in_stream)); } } while (!que.empty()) { @@ -333,25 +354,34 @@ void HeterComm::walk_to_dest( int cur_step = cur_task.step; CopyTask c(cur_task.path, cur_step + 1); que.push(c); - CUDA_CHECK(cudaMemcpyAsync( - cur_task.path->nodes_[cur_step + 1].key_storage, - cur_task.path->nodes_[cur_step].key_storage, - cur_task.path->nodes_[cur_step + 
1].key_bytes_len, cudaMemcpyDefault, - cur_task.path->nodes_[cur_step + 1].in_stream)); + CUDA_CHECK( + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].key_storage, + cur_task.path->nodes_[cur_step].key_storage, + cur_task.path->nodes_[cur_step + 1].key_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream)); if (need_copy_val) { - CUDA_CHECK(cudaMemcpyAsync( - cur_task.path->nodes_[cur_step + 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step + 1].val_bytes_len, - cudaMemcpyDefault, cur_task.path->nodes_[cur_step + 1].in_stream)); + CUDA_CHECK( + cudaMemcpyAsync(cur_task.path->nodes_[cur_step + 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step + 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step + 1].in_stream)); } } } } -template -void HeterComm::walk_to_src( - int start_index, int gpu_num, int* h_left, int* h_right, char* src_val, +template +void HeterComm::walk_to_src( + int start_index, + int gpu_num, + int* h_left, + int* h_right, + char* src_val, size_t val_size) { std::queue que; for (int i = 0; i < gpu_num; i++) { @@ -362,8 +392,10 @@ void HeterComm::walk_to_src( auto& node = path_[start_index][i].nodes_[cur_step]; if (cur_step == 0) { CUDA_CHECK(cudaMemcpyAsync(src_val + uint64_t(h_left[i]) * val_size, - node.val_storage, node.val_bytes_len, - cudaMemcpyDefault, node.out_stream)); + node.val_storage, + node.val_bytes_len, + cudaMemcpyDefault, + node.out_stream)); } else { CopyTask t(&path_[start_index][i], cur_step - 1); que.push(t); @@ -385,24 +417,29 @@ void HeterComm::walk_to_src( if (cur_step > 0) { CopyTask c(cur_task.path, cur_step - 1); que.push(c); - CUDA_CHECK(cudaMemcpyAsync( - cur_task.path->nodes_[cur_step - 1].val_storage, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step - 1].val_bytes_len, cudaMemcpyDefault, - cur_task.path->nodes_[cur_step - 1].out_stream)); + CUDA_CHECK( + cudaMemcpyAsync(cur_task.path->nodes_[cur_step - 1].val_storage, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step - 1].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step - 1].out_stream)); } else if (cur_step == 0) { int end_index = cur_task.path->nodes_.back().dev_num; - CUDA_CHECK(cudaMemcpyAsync( - src_val + uint64_t(h_left[end_index]) * val_size, - cur_task.path->nodes_[cur_step].val_storage, - cur_task.path->nodes_[cur_step].val_bytes_len, cudaMemcpyDefault, - cur_task.path->nodes_[cur_step].out_stream)); + CUDA_CHECK( + cudaMemcpyAsync(src_val + uint64_t(h_left[end_index]) * val_size, + cur_task.path->nodes_[cur_step].val_storage, + cur_task.path->nodes_[cur_step].val_bytes_len, + cudaMemcpyDefault, + cur_task.path->nodes_[cur_step].out_stream)); } } } -template -HeterComm::~HeterComm() { +template +HeterComm::~HeterComm() { if (!multi_mf_dim_) { for (auto& table : tables_) { delete table; @@ -420,15 +457,23 @@ HeterComm::~HeterComm() { } } -template -void HeterComm::show_one_table(int gpu_num) { +template +void HeterComm::show_one_table( + int gpu_num) { if (!multi_mf_dim_) { tables_[gpu_num]->show(); } } -template -void HeterComm::show_table_collisions() { +template +void HeterComm:: + show_table_collisions() { size_t idx = 0; for (auto& table : tables_) { if (table != nullptr) { @@ -443,8 +488,11 @@ void HeterComm::show_table_collisions() { } } -template -int HeterComm::log2i(int x) { +template +int HeterComm::log2i(int x) { unsigned res = 0; while (x >>= 1) { ++res; 
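// --- Editor's aside (illustrative sketch, not part of this patch) ---
// log2i() above returns floor(log2(x)). split_input_to_shard() further down
// uses "num_bits = 1 + log2i(total_device)" as the radix-sort bit width, so
// the shard-index sort only inspects bits that can actually be set for an
// index in [0, total_device - 1]. The helper and check below are hypothetical
// names added only to show that this bit count is always sufficient.
#include <cassert>

static int log2i_sketch(int x) {
  unsigned res = 0;
  while (x >>= 1) {
    ++res;
  }
  return static_cast<int>(res);
}

int main() {
  for (int total_device = 1; total_device <= 64; ++total_device) {
    const int num_bits = 1 + log2i_sketch(total_device);
    // The largest possible shard index must fit in num_bits bits.
    assert(total_device - 1 < (1 << num_bits));
  }
  return 0;
}
// --- End editor's aside ---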
@@ -452,13 +500,20 @@ int HeterComm::log2i(int x) { return res; } -template -int HeterComm::get_index_by_devid(int devid) { +template +int HeterComm::get_index_by_devid( + int devid) { return resource_->get_index_by_devid(devid); } -template -void HeterComm::set_sparse_sgd( +template +void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -466,8 +521,11 @@ void HeterComm::set_sparse_sgd( } } -template -void HeterComm::set_embedx_sgd( +template +void HeterComm::set_embedx_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -475,10 +533,18 @@ void HeterComm::set_embedx_sgd( } } -template -void HeterComm::build_ps( - int dev_num, KeyType* h_keys, ValType* h_vals, size_t len, - size_t chunk_size, int stream_num, int offset) { +template +void HeterComm::build_ps( + int dev_num, + KeyType* h_keys, + ValType* h_vals, + size_t len, + size_t chunk_size, + int stream_num, + int offset) { if (len <= 0) { return; } @@ -513,17 +579,24 @@ void HeterComm::build_ps( auto dst_place = place; auto src_place = platform::CPUPlace(); - memory_copy( - dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), - src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); - memory_copy( - dst_place, reinterpret_cast(d_val_bufs[cur_stream]->ptr()), - src_place, h_vals + cur_len, sizeof(ValType) * tmp_len, cur_use_stream); + memory_copy(dst_place, + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + src_place, + h_keys + cur_len, + sizeof(KeyType) * tmp_len, + cur_use_stream); + memory_copy(dst_place, + reinterpret_cast(d_val_bufs[cur_stream]->ptr()), + src_place, + h_vals + cur_len, + sizeof(ValType) * tmp_len, + cur_use_stream); if (offset == -1) offset = dev_num; tables_[offset]->insert( reinterpret_cast(d_key_bufs[cur_stream]->ptr()), reinterpret_cast(d_val_bufs[cur_stream]->ptr()), - (size_t)tmp_len, cur_use_stream); + (size_t)tmp_len, + cur_use_stream); cur_stream += 1; cur_len += tmp_len; @@ -534,12 +607,18 @@ void HeterComm::build_ps( } } -template -void HeterComm::build_ps(int num, KeyType* h_keys, - char* pool, size_t len, - size_t feature_value_size, - size_t chunk_size, - int stream_num) { +template +void HeterComm::build_ps( + int num, + KeyType* h_keys, + char* pool, + size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { if (len <= 0) { return; } @@ -572,12 +651,19 @@ void HeterComm::build_ps(int num, KeyType* h_keys, auto dst_place = place; auto src_place = platform::CPUPlace(); - memory_copy( - dst_place, reinterpret_cast(d_key_bufs[cur_stream]->ptr()), - src_place, h_keys + cur_len, sizeof(KeyType) * tmp_len, cur_use_stream); + memory_copy(dst_place, + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + src_place, + h_keys + cur_len, + sizeof(KeyType) * tmp_len, + cur_use_stream); ptr_tables_[num]->insert( - reinterpret_cast(d_key_bufs[cur_stream]->ptr()), tmp_len, - pool, feature_value_size, cur_len, cur_use_stream); + reinterpret_cast(d_key_bufs[cur_stream]->ptr()), + tmp_len, + pool, + feature_value_size, + cur_len, + cur_use_stream); cur_stream += 1; cur_len += tmp_len; } @@ -587,9 +673,15 @@ void HeterComm::build_ps(int num, KeyType* h_keys, } } -template -void HeterComm::merge_grad( - int dev_num, KeyType* d_keys, GradType* d_grads, size_t len, +template +void HeterComm::merge_grad( + int dev_num, + KeyType* d_keys, + GradType* d_grads, + 
size_t len, int& uniq_len) { // NOLINT int dev_id = resource_->dev_id(dev_num); DevPlace place = DevPlace(dev_id); @@ -601,37 +693,75 @@ void HeterComm::merge_grad( auto d_merge_grads = memory::Alloc(place, len * sizeof(GradType)); GradType* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - heter_comm_kernel_->sort_pairs(NULL, temp_storage_bytes, d_keys, - d_merge_keys_ptr, d_grads, d_merge_grads_ptr, - len, 0, 8 * sizeof(KeyType), stream, false); + heter_comm_kernel_->sort_pairs(NULL, + temp_storage_bytes, + d_keys, + d_merge_keys_ptr, + d_grads, + d_merge_grads_ptr, + len, + 0, + 8 * sizeof(KeyType), + stream, + false); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - heter_comm_kernel_->sort_pairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, - d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false); + heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_keys, + d_merge_keys_ptr, + d_grads, + d_merge_grads_ptr, + len, + 0, + 8 * sizeof(KeyType), + stream, + false); temp_storage_bytes = 0; auto d_num_runs_out_mem = memory::Alloc(place, sizeof(int)); int* d_num_runs_out = reinterpret_cast(d_num_runs_out_mem->ptr()); - heter_comm_kernel_->reduce_by_key(NULL, temp_storage_bytes, d_merge_keys_ptr, - d_keys, d_merge_grads_ptr, d_grads, - d_num_runs_out, len, stream, false); + heter_comm_kernel_->reduce_by_key(NULL, + temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_merge_grads_ptr, + d_grads, + d_num_runs_out, + len, + stream, + false); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - heter_comm_kernel_->reduce_by_key( - d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, - d_merge_grads_ptr, d_grads, d_num_runs_out, len, stream, false); + heter_comm_kernel_->reduce_by_key(d_temp_storage->ptr(), + temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_merge_grads_ptr, + d_grads, + d_num_runs_out, + len, + stream, + false); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, &uniq_len, src_place, d_num_runs_out, sizeof(int), - stream); + memory_copy( + dst_place, &uniq_len, src_place, d_num_runs_out, sizeof(int), stream); sync_stream(stream); } -template -void HeterComm::dynamic_merge_grad( - int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len, - size_t& segment_len, bool enable_segment_merge_grad) { +template +void HeterComm::dynamic_merge_grad( + int gpu_num, + KeyType* d_keys, + float* d_grads, + size_t len, + int& uniq_len, + size_t& segment_len, + bool enable_segment_merge_grad) { int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); @@ -639,8 +769,9 @@ void HeterComm::dynamic_merge_grad( size_t temp_storage_bytes; size_t grad_dim = max_mf_dim_; - size_t grad_value_size = - TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); @@ -652,70 +783,129 @@ void HeterComm::dynamic_merge_grad( int* d_merged_size = (int*)&d_idx[len]; heter_comm_kernel_->fill_idx(d_idx, len, stream); - 
PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_idx, d_index, len, - 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(NULL, + temp_storage_bytes, + d_keys, + d_merge_keys_ptr, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr, - d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_keys, + d_merge_keys_ptr, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_fea_num_info_ptr, - d_merged_size, len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(NULL, + temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys, - d_fea_num_info_ptr, d_merged_size, len, stream)); - - cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), - cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(d_temp_storage->ptr(), + temp_storage_bytes, + d_merge_keys_ptr, + d_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); + + cudaMemcpyAsync((void*)&uniq_len, + d_merged_size, + sizeof(int), + cudaMemcpyDeviceToHost, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); assert(d_merged_size > 0); uint32_t* d_offset = (uint32_t*)&d_index[len]; temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, - uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); if (enable_segment_merge_grad) { - segment_merge_grad(gpu_num, d_merge_keys_ptr, d_grads, d_index, len, - d_fea_num_info_ptr, uniq_len, segment_len); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_keys, d_merge_keys_ptr, sizeof(KeyType) * segment_len, - cudaMemcpyDeviceToDevice, stream)); + segment_merge_grad(gpu_num, + d_merge_keys_ptr, + d_grads, + d_index, + len, + d_fea_num_info_ptr, + uniq_len, + segment_len); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys, + d_merge_keys_ptr, + sizeof(KeyType) * segment_len, + cudaMemcpyDeviceToDevice, + stream)); 
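// --- Editor's aside (illustrative sketch, not part of this patch) ---
// dynamic_merge_grad(), segment_merge_grad(), merge_keys() and
// dedup_keys_and_fillidx() all follow the standard two-phase CUB calling
// convention seen in the hunks above: call the primitive once with a NULL
// workspace so it only reports temp_storage_bytes, allocate that much device
// memory, then repeat the call with the real workspace. A minimal standalone
// version of that pattern (function and buffer names are made up for the
// example, not taken from this PR):
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdint>

void SortKeysByValueSketch(const uint64_t* d_keys_in, uint64_t* d_keys_out,
                           const uint32_t* d_idx_in, uint32_t* d_idx_out,
                           int num_items, cudaStream_t stream) {
  void* d_temp_storage = nullptr;
  size_t temp_storage_bytes = 0;
  // Phase 1: d_temp_storage == nullptr, so CUB only computes the size.
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                  d_keys_in, d_keys_out, d_idx_in, d_idx_out,
                                  num_items, 0, 8 * sizeof(uint64_t), stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // Phase 2: same arguments, now with a real workspace buffer.
  cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                  d_keys_in, d_keys_out, d_idx_in, d_idx_out,
                                  num_items, 0, 8 * sizeof(uint64_t), stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_temp_storage);
}
// --- End editor's aside ---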
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } else { auto d_merge_grads = memory::Alloc(place, len * grad_value_size); float* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); - heter_comm_kernel_->merge_gradient( - d_keys, d_offset, d_fea_num_info_ptr, d_index, (char*)d_grads, - (char*)d_merge_grads_ptr, uniq_len, grad_dim, grad_value_size, merger_, - stream); - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpyAsync(d_grads, d_merge_grads_ptr, grad_value_size * uniq_len, - cudaMemcpyDeviceToDevice, stream)); + heter_comm_kernel_->merge_gradient(d_keys, + d_offset, + d_fea_num_info_ptr, + d_index, + (char*)d_grads, + (char*)d_merge_grads_ptr, + uniq_len, + grad_dim, + grad_value_size, + merger_, + stream, + feature_value_accessor_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, + d_merge_grads_ptr, + grad_value_size * uniq_len, + cudaMemcpyDeviceToDevice, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } } -template -void HeterComm::segment_merge_grad( +template +void HeterComm::segment_merge_grad( int gpu_num, // the device number KeyType* d_keys, // the sorted keys list, which will be modified after merged @@ -734,8 +924,9 @@ void HeterComm::segment_merge_grad( auto stream = resource_->local_stream(gpu_num, 0); auto grad_dim = max_mf_dim_; - auto grad_value_size = - TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto d_buffer1 = memory::Alloc(place, sizeof(uint32_t) * len); auto d_segments = reinterpret_cast(d_buffer1->ptr()); @@ -751,77 +942,127 @@ void HeterComm::segment_merge_grad( CUDA_CHECK(cudaMemsetAsync(d_segments_num, 0, sizeof(uint32_t), stream)); uint32_t segment_size = FLAGS_gpugraph_merge_grads_segment_size; - heter_comm_kernel_->split_segments(d_fea_num_info, uniq_len, d_segments, - d_segments_num, segment_size, stream); + heter_comm_kernel_->split_segments(d_fea_num_info, + uniq_len, + d_segments, + d_segments_num, + segment_size, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); size_t temp_storage_bytes = 0; PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum( NULL, temp_storage_bytes, d_segments, d_segments_num, uniq_len, stream)); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS( - cub::DeviceReduce::Sum(d_temp_storage->ptr(), temp_storage_bytes, - d_segments, d_segments_num, uniq_len, stream)); - CUDA_CHECK(cudaMemcpyAsync(&segments_num, d_segments_num, sizeof(uint32_t), - cudaMemcpyDeviceToHost, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceReduce::Sum(d_temp_storage->ptr(), + temp_storage_bytes, + d_segments, + d_segments_num, + uniq_len, + stream)); + CUDA_CHECK(cudaMemcpyAsync(&segments_num, + d_segments_num, + sizeof(uint32_t), + cudaMemcpyDeviceToHost, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - cub::DeviceScan::ExclusiveSum(NULL, temp_storage_bytes, d_segments, - d_segments_offset, uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_segments, + d_segments_offset, + uniq_len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), 
temp_storage_bytes, d_segments, d_segments_offset, - uniq_len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_segments, + d_segments_offset, + uniq_len, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - heter_comm_kernel_->expand_segments(d_fea_num_info, d_segments_offset, - uniq_len, d_segments_fea_num_info, - segment_size, stream); + heter_comm_kernel_->expand_segments(d_fea_num_info, + d_segments_offset, + uniq_len, + d_segments_fea_num_info, + segment_size, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, d_segments_fea_num_info, - d_segments_fea_num_offset, segments_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_segments_fea_num_info, + d_segments_fea_num_offset, + segments_num, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_segments_fea_num_info, - d_segments_fea_num_offset, segments_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_segments_fea_num_info, + d_segments_fea_num_offset, + segments_num, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); auto d_segments_keys = memory::Alloc(place, sizeof(KeyType) * segments_num); auto d_segments_keys_ptr = reinterpret_cast(d_segments_keys->ptr()); - heter_comm_kernel_->shrink_keys(d_keys, d_segments_fea_num_offset, - d_segments_keys_ptr, segments_num, stream); + heter_comm_kernel_->shrink_keys(d_keys, + d_segments_fea_num_offset, + d_segments_keys_ptr, + segments_num, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); auto d_segment_grads = memory::Alloc(place, segments_num * grad_value_size); auto d_segment_grads_ptr = reinterpret_cast(d_segment_grads->ptr()); - heter_comm_kernel_->merge_gradient( - d_segments_keys_ptr, d_segments_fea_num_offset, d_segments_fea_num_info, - d_index, (char*)d_grads, (char*)d_segment_grads_ptr, segments_num, - grad_dim, grad_value_size, merger_, stream); + heter_comm_kernel_->merge_gradient(d_segments_keys_ptr, + d_segments_fea_num_offset, + d_segments_fea_num_info, + d_index, + (char*)d_grads, + (char*)d_segment_grads_ptr, + segments_num, + grad_dim, + grad_value_size, + merger_, + stream, + feature_value_accessor_); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys, d_segments_keys_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_keys, + d_segments_keys_ptr, sizeof(KeyType) * segments_num, - cudaMemcpyDeviceToDevice, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_segment_grads_ptr, + cudaMemcpyDeviceToDevice, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, + d_segment_grads_ptr, grad_value_size * segments_num, - cudaMemcpyDeviceToDevice, stream)); + cudaMemcpyDeviceToDevice, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } -template -void HeterComm::split_input_to_shard( - KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right, +template +void HeterComm::split_input_to_shard( + KeyType* d_keys, + int* d_idx_ptr, + size_t len, + int* left, + int* right, int dev_num) { int total_device = 
resource_->total_device(); int dev_id = resource_->dev_id(dev_num); @@ -839,42 +1080,64 @@ void HeterComm::split_input_to_shard( int* d_shard_index_tmp_ptr = reinterpret_cast(d_shard_index_tmp->ptr()); heter_comm_kernel_->fill_idx(d_idx_tmp_ptr, len, stream); - heter_comm_kernel_->calc_shard_index(d_keys, len, d_shard_index_tmp_ptr, - total_device, stream); + heter_comm_kernel_->calc_shard_index( + d_keys, len, d_shard_index_tmp_ptr, total_device, stream); size_t temp_storage_bytes; const int num_bits = 1 + log2i(total_device); - heter_comm_kernel_->sort_pairs( - NULL, temp_storage_bytes, d_shard_index_tmp_ptr, d_shard_index_ptr, - d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); + heter_comm_kernel_->sort_pairs(NULL, + temp_storage_bytes, + d_shard_index_tmp_ptr, + d_shard_index_ptr, + d_idx_tmp_ptr, + d_idx_ptr, + len, + 0, + num_bits, + stream); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - heter_comm_kernel_->sort_pairs( - d_temp_storage->ptr(), temp_storage_bytes, d_shard_index_tmp_ptr, - d_shard_index_ptr, d_idx_tmp_ptr, d_idx_ptr, len, 0, num_bits, stream); - - heter_comm_kernel_->calc_shard_offset(d_shard_index_ptr, left, right, len, - total_device, stream); + heter_comm_kernel_->sort_pairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_shard_index_tmp_ptr, + d_shard_index_ptr, + d_idx_tmp_ptr, + d_idx_ptr, + len, + 0, + num_bits, + stream); + + heter_comm_kernel_->calc_shard_offset( + d_shard_index_ptr, left, right, len, total_device, stream); sync_stream(stream); } -template -void HeterComm::merge_keys( - int gpu_num, const KeyType* d_keys, size_t len, // input - KeyType* d_sorted_keys, // output - KeyType* d_merged_keys, // output - uint32_t* d_restore_idx, // output - size_t& uniq_len) { // output +template +void HeterComm::merge_keys( + int gpu_num, + const KeyType* d_keys, + size_t len, // input + KeyType* d_sorted_keys, // output + KeyType* d_merged_keys, // output + uint32_t* d_restore_idx, // output + size_t& uniq_len) { // output int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); auto stream = resource_->local_stream(gpu_num, 0); size_t grad_dim = max_mf_dim_; - size_t grad_value_size = TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 4 + 1)); - uint32_t* d_fea_num_info_ptr = reinterpret_cast(d_fea_num_info->ptr()); + uint32_t* d_fea_num_info_ptr = + reinterpret_cast(d_fea_num_info->ptr()); uint32_t* d_idx = (uint32_t*)&d_fea_num_info_ptr[len]; uint32_t* d_index = (uint32_t*)&d_idx[len]; uint32_t* d_offset = (uint32_t*)&d_index[len]; @@ -882,52 +1145,99 @@ void HeterComm::merge_keys( heter_comm_kernel_->fill_idx(d_idx, len, stream); size_t temp_storage_bytes; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_sorted_keys, d_idx, d_index, len, - 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(NULL, + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_sorted_keys, 
- d_idx, d_index, len, 0, 8 * sizeof(KeyType), stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_idx, + d_index, + len, + 0, + 8 * sizeof(KeyType), + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_sorted_keys, d_merged_keys, d_fea_num_info_ptr, - d_merged_size, len, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(NULL, + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - d_temp_storage->ptr(), temp_storage_bytes, d_sorted_keys, d_merged_keys, - d_fea_num_info_ptr, d_merged_size, len, stream)); - cudaMemcpyAsync((void*)&uniq_len, d_merged_size, sizeof(int), - cudaMemcpyDeviceToHost, stream); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(d_temp_storage->ptr(), + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_fea_num_info_ptr, + d_merged_size, + len, + stream)); + cudaMemcpyAsync((void*)&uniq_len, + d_merged_size, + sizeof(int), + cudaMemcpyDeviceToHost, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - NULL, temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum(NULL, + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); if (d_temp_storage->size() < temp_storage_bytes) { d_temp_storage = NULL; d_temp_storage = memory::Alloc(place, temp_storage_bytes); } - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( - d_temp_storage->ptr(), temp_storage_bytes, d_fea_num_info_ptr, d_offset, uniq_len, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceScan::ExclusiveSum(d_temp_storage->ptr(), + temp_storage_bytes, + d_fea_num_info_ptr, + d_offset, + uniq_len, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - heter_comm_kernel_->fill_restore_idx( - true, len, uniq_len, d_merged_keys, d_index, d_offset, - d_fea_num_info_ptr, d_restore_idx, stream); + heter_comm_kernel_->fill_restore_idx(true, + len, + uniq_len, + d_merged_keys, + d_index, + d_offset, + d_fea_num_info_ptr, + d_restore_idx, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } -template -void HeterComm::pull_merge_sparse( - int num, KeyType* d_keys, float* d_vals, size_t len) { +template +void HeterComm::pull_merge_sparse( + int num, KeyType* d_keys, float* d_vals, size_t len) { int total_device = resource_->total_device(); int dev_id = resource_->dev_id(num); DevPlace place = DevPlace(dev_id); @@ -952,18 +1262,24 @@ void HeterComm::pull_merge_sparse( auto xpu_context = xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, 
platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); #endif - size_t val_type_size = feature_value_accessor_.common_pull_value.Size(max_mf_dim_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size; auto d_sorted_keys = memory::Alloc(place, len * sizeof(KeyType)); auto d_sorted_keys_ptr = reinterpret_cast(d_sorted_keys->ptr()); @@ -977,28 +1293,38 @@ void HeterComm::pull_merge_sparse( auto d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); size_t uniq_len = 0; - merge_keys(num, d_keys, len, - d_sorted_keys_ptr, - d_merged_keys_ptr, - d_restore_idx_ptr, - uniq_len); + merge_keys(num, + d_keys, + len, + d_sorted_keys_ptr, + d_merged_keys_ptr, + d_restore_idx_ptr, + uniq_len); sync_stream(stream); auto d_idx = memory::Alloc(place, uniq_len * sizeof(int)); auto d_idx_ptr = reinterpret_cast(d_idx->ptr()); - split_input_to_shard(d_merged_keys_ptr, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, num); + split_input_to_shard( + d_merged_keys_ptr, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, num); heter_comm_kernel_->fill_shard_key( - d_shard_keys_ptr, d_merged_keys_ptr, d_idx_ptr, uniq_len, - stream); + d_shard_keys_ptr, d_merged_keys_ptr, d_idx_ptr, uniq_len, stream); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1006,8 +1332,8 @@ void HeterComm::pull_merge_sparse( if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(num, i, shard_len * sizeof(KeyType), - shard_len * val_type_size); + create_storage( + num, i, shard_len * sizeof(KeyType), shard_len * val_type_size); } walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); } @@ -1024,13 +1350,17 @@ void HeterComm::pull_merge_sparse( ptr_tables_[i]->rwlock_->RDLock(); if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->get(reinterpret_cast(node.key_storage), - node.val_storage, h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + node.val_storage, + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num), + feature_value_accessor_); } else { ptr_tables_[i]->get( d_shard_keys_ptr + h_left[i], reinterpret_cast(d_shard_vals_ptr) + h_left[i] * val_type_size, - h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num)); + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num), + feature_value_accessor_); } } @@ -1043,8 +1373,12 @@ void HeterComm::pull_merge_sparse( } if (!FLAGS_gpugraph_enable_gpu_direct_access) { - walk_to_src(num, total_device, h_left, h_right, - reinterpret_cast(d_shard_vals_ptr), val_type_size); + walk_to_src(num, + total_device, + h_left, + h_right, + reinterpret_cast(d_shard_vals_ptr), + val_type_size); for (int i = 0; i < total_device; ++i) { auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); @@ -1053,17 
+1387,21 @@ void HeterComm::pull_merge_sparse( auto d_merged_vals = memory::Alloc(place, uniq_len * val_type_size); auto d_merged_vals_ptr = reinterpret_cast(d_merged_vals->ptr()); - heter_comm_kernel_->dy_mf_fill_dvals( - d_shard_vals_ptr, d_merged_vals_ptr, - d_idx_ptr, uniq_len, - val_type_size, stream); + heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, + d_merged_vals_ptr, + d_idx_ptr, + uniq_len, + val_type_size, + stream); sync_stream(stream); - heter_comm_kernel_->unpack_merged_vals( - len, d_keys, - d_merged_vals_ptr, - d_restore_idx_ptr, - d_vals, val_type_size, stream); + heter_comm_kernel_->unpack_merged_vals(len, + d_keys, + d_merged_vals_ptr, + d_restore_idx_ptr, + d_vals, + val_type_size, + stream); sync_stream(stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { @@ -1075,9 +1413,12 @@ void HeterComm::pull_merge_sparse( } } } -template -void HeterComm::pull_normal_sparse( - int num, KeyType* d_keys, float* d_vals, size_t len) { +template +void HeterComm::pull_normal_sparse( + int num, KeyType* d_keys, float* d_vals, size_t len) { int total_device = resource_->total_device(); int dev_id = resource_->dev_id(num); DevPlace place = DevPlace(dev_id); @@ -1102,21 +1443,27 @@ void HeterComm::pull_normal_sparse( auto xpu_context = xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); #endif auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - size_t val_type_size = feature_value_accessor_.common_pull_value.Size(max_mf_dim_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size; auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); @@ -1125,18 +1472,26 @@ void HeterComm::pull_normal_sparse( split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); - heter_comm_kernel_->fill_shard_key(d_shard_keys_ptr, d_keys, d_idx_ptr, len, - stream); + heter_comm_kernel_->fill_shard_key( + d_shard_keys_ptr, d_keys, d_idx_ptr, len, stream); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1144,8 +1499,8 @@ void HeterComm::pull_normal_sparse( if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(num, i, shard_len * sizeof(KeyType), - shard_len * 
val_type_size); + create_storage( + num, i, shard_len * sizeof(KeyType), shard_len * val_type_size); } walk_to_dest(num, total_device, h_left, h_right, d_shard_keys_ptr, NULL); } @@ -1161,13 +1516,17 @@ void HeterComm::pull_normal_sparse( ptr_tables_[i]->rwlock_->RDLock(); if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->get(reinterpret_cast(node.key_storage), - node.val_storage, h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + node.val_storage, + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num), + feature_value_accessor_); } else { ptr_tables_[i]->get( d_shard_keys_ptr + h_left[i], reinterpret_cast(d_shard_vals_ptr) + h_left[i] * val_type_size, - h_right[i] - h_left[i] + 1, resource_->remote_stream(i, num)); + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, num), + feature_value_accessor_); } } @@ -1179,17 +1538,20 @@ void HeterComm::pull_normal_sparse( ptr_tables_[i]->rwlock_->UNLock(); } if (!FLAGS_gpugraph_enable_gpu_direct_access) { - walk_to_src(num, total_device, h_left, h_right, - reinterpret_cast(d_shard_vals_ptr), val_type_size); + walk_to_src(num, + total_device, + h_left, + h_right, + reinterpret_cast(d_shard_vals_ptr), + val_type_size); for (int i = 0; i < total_device; ++i) { auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); } } - heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, d_vals, d_idx_ptr, len, - val_type_size, stream); - + heter_comm_kernel_->dy_mf_fill_dvals( + d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size, stream); sync_stream(stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1201,9 +1563,12 @@ void HeterComm::pull_normal_sparse( } } -template -void HeterComm::pull_sparse( - int num, KeyType* d_keys, float* d_vals, size_t len) { +template +void HeterComm::pull_sparse( + int num, KeyType* d_keys, float* d_vals, size_t len) { if (len == 0) { return; } @@ -1215,13 +1580,17 @@ void HeterComm::pull_sparse( } #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - float* d_grads, - size_t len, - Sgd& sgd) { // NOLINT +void HeterComm::push_sparse( + int dev_num, + KeyType* d_keys, + float* d_grads, + size_t len, + Sgd& sgd) { // NOLINT if (len == 0) { return; } @@ -1229,8 +1598,9 @@ void HeterComm::push_sparse(int dev_num, int total_device = resource_->total_device(); int dev_id = resource_->dev_id(dev_num); - size_t grad_value_size = - TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); @@ -1253,14 +1623,18 @@ void HeterComm::push_sparse(int dev_num, auto xpu_context = xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d 
%s]", + r2, XPUAPIErrorMsg[r2])); #endif @@ -1281,35 +1655,49 @@ void HeterComm::push_sparse(int dev_num, // do two gradient merge // 1st. do segmented gradient merge // 2nd. do global gradient merge - dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, - true); + dynamic_merge_grad( + dev_num, d_keys, d_grads, len, uniq_len, segment_len, true); len = segment_len; uniq_len = 0; segment_len = 0; - dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, - false); + dynamic_merge_grad( + dev_num, d_keys, d_grads, len, uniq_len, segment_len, false); } else { // Perform gradient merge only once - dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len, segment_len, - false); + dynamic_merge_grad( + dev_num, d_keys, d_grads, len, uniq_len, segment_len, false); } } - split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, - dev_num); + split_input_to_shard( + d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - heter_comm_kernel_->dy_mf_fill_shard_grads( - d_shard_keys_ptr, d_keys, d_shard_grads_ptr, d_grads, d_idx_ptr, uniq_len, - grad_value_size, stream); + heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr, + d_keys, + d_shard_grads_ptr, + d_grads, + d_idx_ptr, + uniq_len, + grad_value_size, + stream, + feature_value_accessor_); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); if (!FLAGS_gpugraph_enable_gpu_direct_access) { for (int i = 0; i < total_device; ++i) { @@ -1317,12 +1705,17 @@ void HeterComm::push_sparse(int dev_num, if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(dev_num, i, shard_len * sizeof(KeyType), - shard_len * grad_value_size); + create_storage( + dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size); } - walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, - reinterpret_cast(d_shard_grads_ptr), grad_value_size); + walk_to_dest(dev_num, + total_device, + h_left, + h_right, + d_shard_keys_ptr, + reinterpret_cast(d_shard_grads_ptr), + grad_value_size); } for (int i = 0; i < total_device; ++i) { @@ -1338,13 +1731,16 @@ void HeterComm::push_sparse(int dev_num, ptr_tables_[i]->rwlock_->WRLock(); if (!FLAGS_gpugraph_enable_gpu_direct_access) { ptr_tables_[i]->update(reinterpret_cast(node.key_storage), - node.val_storage, h_right[i] - h_left[i] + 1, sgd, + node.val_storage, + h_right[i] - h_left[i] + 1, + sgd, resource_->remote_stream(i, dev_num)); } else { ptr_tables_[i]->update(d_shard_keys_ptr + h_left[i], reinterpret_cast(d_shard_grads_ptr) + grad_value_size * h_left[i], - h_right[i] - h_left[i] + 1, sgd, + h_right[i] - h_left[i] + 1, + sgd, resource_->remote_stream(i, dev_num)); } } @@ -1371,11 +1767,12 @@ void HeterComm::push_sparse(int dev_num, } #elif defined(PADDLE_WITH_XPU_KP) -template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - GradType* d_grads, - size_t len) { +template +void HeterComm::push_sparse( + int dev_num, KeyType* d_keys, GradType* d_grads, size_t len) { if (len == 0) { return; } @@ -1405,14 +1802,18 @@ void HeterComm::push_sparse(int dev_num, auto xpu_context = 
xpu_dev_ctx.x_context(); int r = xpu::constant(xpu_context, d_left_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r, + "XPU constant kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); int r2 = xpu::constant(xpu_context, d_right_ptr, total_device, -1); - PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, + PADDLE_ENFORCE_EQ(r2, + XPU_SUCCESS, platform::errors::External( - "XPU constant kernel return wrong value[%d %s]", r2, + "XPU constant kernel return wrong value[%d %s]", + r2, XPUAPIErrorMsg[r2])); #endif @@ -1428,32 +1829,48 @@ void HeterComm::push_sparse(int dev_num, int uniq_len = len; merge_grad(dev_num, d_keys, d_grads, len, uniq_len); - split_input_to_shard(d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, - dev_num); + split_input_to_shard( + d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys, - d_shard_grads_ptr, d_grads, d_idx_ptr, - (long long)uniq_len, stream); + heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, + d_keys, + d_shard_grads_ptr, + d_grads, + d_idx_ptr, + (long long)uniq_len, + stream); sync_stream(stream); auto dst_place = platform::CPUPlace(); auto src_place = place; - memory_copy(dst_place, h_left, src_place, d_left_ptr, - total_device * sizeof(int), stream); - memory_copy(dst_place, h_right, src_place, d_right_ptr, - total_device * sizeof(int), stream); + memory_copy(dst_place, + h_left, + src_place, + d_left_ptr, + total_device * sizeof(int), + stream); + memory_copy(dst_place, + h_right, + src_place, + d_right_ptr, + total_device * sizeof(int), + stream); for (int i = 0; i < total_device; ++i) { int shard_len = h_right[i] - h_left[i] + 1; if (h_left[i] == -1 || h_right[i] == -1) { continue; } - create_storage(dev_num, i, shard_len * sizeof(KeyType), - shard_len * sizeof(GradType)); + create_storage( + dev_num, i, shard_len * sizeof(KeyType), shard_len * sizeof(GradType)); } - walk_to_dest(dev_num, total_device, h_left, h_right, d_shard_keys_ptr, + walk_to_dest(dev_num, + total_device, + h_left, + h_right, + d_shard_keys_ptr, d_shard_grads_ptr); for (int i = 0; i < total_device; ++i) { @@ -1489,10 +1906,16 @@ void HeterComm::push_sparse(int dev_num, #endif #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::update_one_table( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, +void HeterComm::update_one_table( + int gpu_num, + KeyType* d_keys, + GradType* d_grads, + size_t len, Sgd& sgd) { // NOLINT if (len == 0) { return; @@ -1501,16 +1924,22 @@ void HeterComm::update_one_table( int dev_id = resource_->dev_id(gpu_num); platform::CUDADeviceGuard guard(dev_id); tables_[gpu_num]->rwlock_->WRLock(); - tables_[gpu_num]->update(d_keys, d_grads, len, sgd, - resource_->remote_stream(gpu_num, gpu_num)); + tables_[gpu_num]->update( + d_keys, d_grads, len, sgd, resource_->remote_stream(gpu_num, gpu_num)); tables_[gpu_num]->rwlock_->UNLock(); cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); } -template +template template -void HeterComm::push_sparse_multi_node( - int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len, +void HeterComm::push_sparse_multi_node( + int gpu_num, + KeyType* d_keys, + GradType* d_grads, + size_t len, Sgd& sgd) { // NOLINT if (len == 0) { return; @@ -1521,15 +1950,23 @@ void HeterComm::push_sparse_multi_node( uniq_len = gather_one_node_grad(gpu_num, d_keys, d_grads, uniq_len); - 
uniq_len = gather_multi_node_grad(gpu_num, storage_[gpu_num].local_keys, - storage_[gpu_num].local_grads, uniq_len); + uniq_len = gather_multi_node_grad(gpu_num, + storage_[gpu_num].local_keys, + storage_[gpu_num].local_grads, + uniq_len); - update_one_table(gpu_num, storage_[gpu_num].local_keys, - storage_[gpu_num].local_grads, uniq_len, sgd); + update_one_table(gpu_num, + storage_[gpu_num].local_keys, + storage_[gpu_num].local_grads, + uniq_len, + sgd); } -template -int HeterComm::gather_one_node_grad( +template +int HeterComm::gather_one_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int total_gpu = resource_->total_device(); int dev_id = resource_->dev_id(gpu_num); @@ -1546,19 +1983,24 @@ int HeterComm::gather_one_node_grad( int* d_node_len = reinterpret_cast(d_node_len_mem->ptr()); h_node_len[gpu_num] = len; - cudaMemcpy(d_node_len + gpu_num, h_node_len + gpu_num, sizeof(int), + cudaMemcpy(d_node_len + gpu_num, + h_node_len + gpu_num, + sizeof(int), cudaMemcpyHostToDevice); // allgather grad len PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( - (const void*)(d_node_len + gpu_num), (void*)d_node_len, 1, // NOLINT - ncclInt, // NOLINT - nccl_inner_comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllGather((const void*)(d_node_len + gpu_num), + (void*)d_node_len, + 1, // NOLINT + ncclInt, // NOLINT + nccl_inner_comm, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - cudaMemcpy(h_node_len, d_node_len, sizeof(int) * total_gpu, - cudaMemcpyDeviceToHost); + cudaMemcpy( + h_node_len, d_node_len, sizeof(int) * total_gpu, cudaMemcpyDeviceToHost); for (int i = 0; i < total_gpu; ++i) { if (h_node_len[i] > max_size) { @@ -1572,9 +2014,13 @@ int HeterComm::gather_one_node_grad( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inner_comm, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( - d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, - nccl_inner_comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllGather(d_grads, + storage.all_grads, + max_size * sizeof(GradType), + ncclUint8, + nccl_inner_comm, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); @@ -1594,18 +2040,24 @@ int HeterComm::gather_one_node_grad( cudaMemset(d_left_ptr, -1, total_gpu * sizeof(int)); cudaMemset(d_right_ptr, -1, total_gpu * sizeof(int)); - split_input_to_shard(storage.all_keys + index, d_idx_ptr, h_node_len[i], - d_left_ptr, d_right_ptr, gpu_num); - cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), - cudaMemcpyDeviceToHost); - - heter_comm_kernel_->fill_shard_grads( - storage.local_keys + merge_num, storage.all_keys + index, - storage.local_grads + merge_num, storage.all_grads + index, - d_idx_ptr + h_left[gpu_num], h_right[gpu_num] - h_left[gpu_num] + 1, - stream); + split_input_to_shard(storage.all_keys + index, + d_idx_ptr, + h_node_len[i], + d_left_ptr, + d_right_ptr, + gpu_num); + cudaMemcpy( + h_left, d_left_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy( + h_right, d_right_ptr, total_gpu * sizeof(int), cudaMemcpyDeviceToHost); + + heter_comm_kernel_->fill_shard_grads(storage.local_keys + 
merge_num, + storage.all_keys + index, + storage.local_grads + merge_num, + storage.all_grads + index, + d_idx_ptr + h_left[gpu_num], + h_right[gpu_num] - h_left[gpu_num] + 1, + stream); merge_num = merge_num + h_right[gpu_num] - h_left[gpu_num] + 1; } @@ -1614,8 +2066,11 @@ int HeterComm::gather_one_node_grad( return ret; } -template -int HeterComm::gather_multi_node_grad( +template +int HeterComm::gather_multi_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int dev_id = resource_->dev_id(gpu_num); auto& storage = storage_[gpu_num]; @@ -1638,8 +2093,8 @@ int HeterComm::gather_multi_node_grad( d_node_len, d_node_len, 1, ncclInt, nccl_inter_comm, stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - cudaMemcpy(h_node_len, d_node_len, sizeof(int) * node_size_, - cudaMemcpyDeviceToHost); + cudaMemcpy( + h_node_len, d_node_len, sizeof(int) * node_size_, cudaMemcpyDeviceToHost); for (int i = 0; i < node_size_; ++i) { if (h_node_len[i] > max_size) { @@ -1653,19 +2108,29 @@ int HeterComm::gather_multi_node_grad( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( d_keys, storage.all_keys, max_size, ncclUint64, nccl_inter_comm, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( - d_grads, storage.all_grads, max_size * sizeof(GradType), ncclUint8, - nccl_inter_comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllGather(d_grads, + storage.all_grads, + max_size * sizeof(GradType), + ncclUint8, + nccl_inter_comm, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); int merge_num = 0; for (int i = 0; i < node_size_; ++i) { int index = i * max_size; - cudaMemcpyAsync(storage.local_keys + merge_num, storage.all_keys + index, - h_node_len[i], cudaMemcpyDefault, stream); - cudaMemcpyAsync(storage.local_grads + merge_num, storage.all_grads + index, - h_node_len[i], cudaMemcpyDefault, stream); + cudaMemcpyAsync(storage.local_keys + merge_num, + storage.all_keys + index, + h_node_len[i], + cudaMemcpyDefault, + stream); + cudaMemcpyAsync(storage.local_grads + merge_num, + storage.all_grads + index, + h_node_len[i], + cudaMemcpyDefault, + stream); merge_num += h_node_len[i]; } @@ -1675,8 +2140,11 @@ int HeterComm::gather_multi_node_grad( } #endif -template -void HeterComm::end_pass() { +template +void HeterComm::end_pass() { int total_device = resource_->total_device(); std::vector threads; @@ -1698,8 +2166,11 @@ void HeterComm::end_pass() { } #if defined(PADDLE_WITH_CUDA) -template -int HeterComm::dedup_keys_and_fillidx( +template +int HeterComm::dedup_keys_and_fillidx( const int gpu_id, const int total_fea_num, const KeyType* d_keys, // input @@ -1727,29 +2198,61 @@ int HeterComm::dedup_keys_and_fillidx( void* d_buf = NULL; size_t temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - NULL, temp_storage_bytes, d_keys, d_sorted_keys, d_index_in, d_sorted_idx, - total_fea_num, 0, 8 * sizeof(KeyType), stream, false)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(NULL, + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_index_in, + d_sorted_idx, + total_fea_num, + 0, + 8 * sizeof(KeyType), + stream, + false)); auto d_cache_ptr = memory::Alloc(place, temp_storage_bytes); d_buf = reinterpret_cast(d_cache_ptr->ptr()); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRadixSort::SortPairs( - d_buf, temp_storage_bytes, d_keys, d_sorted_keys, 
d_index_in, - d_sorted_idx, total_fea_num, 0, 8 * sizeof(KeyType), stream, false)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRadixSort::SortPairs(d_buf, + temp_storage_bytes, + d_keys, + d_sorted_keys, + d_index_in, + d_sorted_idx, + total_fea_num, + 0, + 8 * sizeof(KeyType), + stream, + false)); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - NULL, temp_storage_bytes, d_sorted_keys, d_merged_keys, d_merged_cnts, - d_merged_size, total_fea_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(NULL, + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_merged_cnts, + d_merged_size, + total_fea_num, + stream)); if (d_cache_ptr->size() < temp_storage_bytes) { d_cache_ptr = NULL; d_cache_ptr = memory::Alloc(place, temp_storage_bytes); } d_buf = reinterpret_cast(d_cache_ptr->ptr()); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceRunLengthEncode::Encode( - d_buf, temp_storage_bytes, d_sorted_keys, d_merged_keys, d_merged_cnts, - d_merged_size, total_fea_num, stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceRunLengthEncode::Encode(d_buf, + temp_storage_bytes, + d_sorted_keys, + d_merged_keys, + d_merged_cnts, + d_merged_size, + total_fea_num, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync((void*)&merged_size, - (void*)d_merged_size, sizeof(int), - cudaMemcpyDeviceToHost, stream)); + (void*)d_merged_size, + sizeof(int), + cudaMemcpyDeviceToHost, + stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceScan::ExclusiveSum( @@ -1767,8 +2270,14 @@ int HeterComm::dedup_keys_and_fillidx( } // fill restore idx [1,3,5,2,4,6] = [1,2,1,3,2,1] heter_comm_kernel_->fill_restore_idx(filter_zero, - total_fea_num, merged_size, d_merged_keys, d_sorted_idx, - d_offset, d_merged_cnts, d_restore_idx, stream); + total_fea_num, + merged_size, + d_merged_keys, + d_sorted_idx, + d_offset, + d_merged_cnts, + d_restore_idx, + stream); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index 8d4f2625be70d..93ad75195882b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -117,12 +117,12 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ValType* d_vals, } } -template +template __global__ void merge_gradients_basic_kernel( const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_value_size, DynamicGradMerger& merger, - CommonFeatureValueAccessor& feature_value_accessor) { + FVAccessor& feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { @@ -143,12 +143,12 @@ __global__ void merge_gradients_basic_kernel( } } -template +template __global__ void merge_gradients_embedx_kernel( const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, - CommonFeatureValueAccessor& feature_value_accessor) { + FVAccessor& feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { @@ -363,11 +363,12 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage, debug_synchronous)); } -template +template void HeterCommKernel::dy_mf_fill_shard_grads( KeyType* d_shard_keys, KeyType* d_keys, float* 
d_shard_grads, float* d_grads, T* idx, long long len, size_t grad_value_size, - const StreamType& stream) { + const StreamType& stream, FVAccessor& feature_value_accessor) { int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; @@ -383,21 +384,21 @@ void HeterCommKernel::dy_mf_fill_shard_grads( d_shard_grads, d_grads, idx, N, grad_value_size_float); } -template +template void HeterCommKernel::merge_gradient( const KeyType* d_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, - const StreamType& stream) { + const StreamType& stream, FVAccessor& feature_value_accessor) { int grid_size1 = (n - 1) / block_size_ + 1; merge_gradients_basic_kernel<<>>( d_keys, offset, fea_num, index, input, output, n, grad_value_size, merger, - feature_value_accessor_); + feature_value_accessor); if (grad_dim > 0) { int grid_size2 = (n * grad_dim - 1) / block_size_ + 1; merge_gradients_embedx_kernel<<>>( d_keys, offset, fea_num, index, input, output, n * grad_dim, grad_dim, - grad_value_size, merger, feature_value_accessor_); + grad_value_size, merger, feature_value_accessor); } } @@ -596,22 +597,23 @@ template void HeterCommKernel::reduce_by_key< int num_items, cudaStream_t stream, bool debug_synchronous); template void -HeterCommKernel::dy_mf_fill_shard_grads( +HeterCommKernel::dy_mf_fill_shard_grads< + unsigned long, int, cudaStream_t, CommonFeatureValueAccessor>( unsigned long* d_shard_keys, unsigned long* d_keys, float* d_shard_grads, float* d_grads, int* idx, long long len, size_t grad_value_size, - const cudaStream_t& stream); + const cudaStream_t& stream, CommonFeatureValueAccessor& feature_value_accessor); -template void HeterCommKernel::merge_gradient( +template void HeterCommKernel::merge_gradient( const uint32_t* d_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, - const cudaStream_t& stream); + const cudaStream_t& stream, CommonFeatureValueAccessor& feature_value_accessor); -template void HeterCommKernel::merge_gradient( +template void HeterCommKernel::merge_gradient( const uint64_t* d_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, - const cudaStream_t& stream); + const cudaStream_t& stream, CommonFeatureValueAccessor& feature_value_accessor); template void HeterCommKernel::dy_mf_fill_dvals( float* d_shard_vals, float* d_vals, int* idx, long long len, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 05d93b1d8bcc0..cb02773bc034f 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -41,82 +41,48 @@ struct DynamicGradMerger { return out; } + template __device__ __forceinline__ void update_one( float* output, const float* input, - CommonFeatureValueAccessor& feature_value_accessor) { - output[feature_value_accessor.common_push_value.SlotIndex()] = - input[feature_value_accessor.common_push_value.SlotIndex()]; - output[feature_value_accessor.common_push_value.ShowIndex()] = - input[feature_value_accessor.common_push_value.ShowIndex()]; - output[feature_value_accessor.common_push_value.ClickIndex()] = - 
input[feature_value_accessor.common_push_value.ClickIndex()]; - output[feature_value_accessor.common_push_value.MfDimIndex()] = - input[feature_value_accessor.common_push_value.MfDimIndex()]; - output[feature_value_accessor.common_push_value.EmbedGIndex()] = - input[feature_value_accessor.common_push_value.EmbedGIndex()]; - for (int j = 0; - j < int(output[feature_value_accessor.common_push_value.MfDimIndex()]); - j++) { - output[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = - input[feature_value_accessor.common_push_value.EmbedxGIndex() + j]; - } + FVAccessor& feature_value_accessor) { + feature_value_accessor.PushValueFill(output, input); } + template __device__ __forceinline__ void merge_one( float* output, const float* input, - CommonFeatureValueAccessor& feature_value_accessor) { - output[feature_value_accessor.common_push_value.ShowIndex()] += - input[feature_value_accessor.common_push_value.ShowIndex()]; - output[feature_value_accessor.common_push_value.ClickIndex()] += - input[feature_value_accessor.common_push_value.ClickIndex()]; - output[feature_value_accessor.common_push_value.EmbedGIndex()] += - input[feature_value_accessor.common_push_value.EmbedGIndex()]; - for (int j = 0; - j < int(output[feature_value_accessor.common_push_value.MfDimIndex()]); - j++) { - output[feature_value_accessor.common_push_value.EmbedxGIndex() + j] += - input[feature_value_accessor.common_push_value.EmbedxGIndex() + j]; - } + FVAccessor& feature_value_accessor) { + feature_value_accessor.MergePushValue(output, input); } + template __device__ __forceinline__ void update_basic( float* output, const float* input, - CommonFeatureValueAccessor& fv_accessor) { - output[fv_accessor.common_push_value.SlotIndex()] = - input[fv_accessor.common_push_value.SlotIndex()]; - output[fv_accessor.common_push_value.ShowIndex()] = - input[fv_accessor.common_push_value.ShowIndex()]; - output[fv_accessor.common_push_value.ClickIndex()] = - input[fv_accessor.common_push_value.ClickIndex()]; - output[fv_accessor.common_push_value.MfDimIndex()] = - input[fv_accessor.common_push_value.MfDimIndex()]; - output[fv_accessor.common_push_value.EmbedGIndex()] = - input[fv_accessor.common_push_value.EmbedGIndex()]; + FVAccessor& fv_accessor) { + fv_accessor.PushValueFillBasic(output, input); } + template __device__ __forceinline__ void merge_basic( float* output, const float* input, - CommonFeatureValueAccessor& fv_accessor) { - output[fv_accessor.common_push_value.ShowIndex()] += - input[fv_accessor.common_push_value.ShowIndex()]; - output[fv_accessor.common_push_value.ClickIndex()] += - input[fv_accessor.common_push_value.ClickIndex()]; - output[fv_accessor.common_push_value.EmbedGIndex()] += - input[fv_accessor.common_push_value.EmbedGIndex()]; + FVAccessor& fv_accessor) { + fv_accessor.MergePushValueBasic(output, input); } + template __device__ __forceinline__ void update_embedx( float* output, const float* input, size_t embedx_idx, - CommonFeatureValueAccessor& fv_accessor) { + FVAccessor& fv_accessor) { if (embedx_idx < output[fv_accessor.common_push_value.MfDimIndex()]) { output[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx] = input[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx]; } } + template __device__ __forceinline__ void merge_embedx( float* output, const float* input, size_t embedx_idx, - CommonFeatureValueAccessor& fv_accessor) { + FVAccessor& fv_accessor) { if (embedx_idx < output[fv_accessor.common_push_value.MfDimIndex()]) { output[fv_accessor.common_push_value.EmbedxGIndex() 
+ embedx_idx] += input[fv_accessor.common_push_value.EmbedxGIndex() + embedx_idx]; @@ -129,11 +95,6 @@ class HeterCommKernel { HeterCommKernel() {} explicit HeterCommKernel(const int block_size) : block_size_(block_size) {} - explicit HeterCommKernel(const int block_size, - CommonFeatureValueAccessor& feature_value_accessor) - : block_size_(block_size), - feature_value_accessor_(feature_value_accessor) {} - template void fill_idx(T* idx, long long len, const StreamType& stream); @@ -182,18 +143,21 @@ class HeterCommKernel { StreamType stream = NULL, bool debug_synchronous = false); - template + template void dy_mf_fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, float* d_shard_grads, float* d_grads, T* idx, long long len, size_t grad_value_size, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); - template + template void merge_gradient(const KeyType* d_shard_keys, const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, const char* input, char* output, int n, size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); template void dy_mf_fill_dvals(float* d_shard_vals, float* d_vals, T* idx, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc index 1ec006f580c96..86e339ca74a30 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" #include +#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" #ifdef PADDLE_WITH_HETERPS @@ -21,25 +21,39 @@ namespace paddle { namespace framework { HeterPsBase* HeterPsBase::get_instance( - size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, + size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type) { - return new HeterPs(capacity, resource, feature_value_accessor, optimizer_type); + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } else { + VLOG(0) << " HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type; + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type) { - comm_ = - std::make_shared>( - capacity, resource); - feature_value_accessor_ = feature_value_accessor; +HeterPs::HeterPs(size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); optimizer_type_ = optimizer_type; } HeterPs::~HeterPs() {} -void HeterPs::pull_sparse(int num, FeatureKey* d_keys, float* d_vals, +void HeterPs::pull_sparse(int num, + FeatureKey* d_keys, + float* d_vals, size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } @@ -60,8 +74,10 @@ void HeterPs::end_pass() { comm_->end_pass(); } void 
HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } -void HeterPs::push_sparse(int num, FeatureKey* d_keys, - float* d_grads, size_t len) { +void HeterPs::push_sparse(int num, + FeatureKey* d_keys, + float* d_grads, + size_t len) { comm_->push_sparse(num, d_keys, d_grads, len); // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index b812250f012a0..32f037b248944 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -23,87 +23,132 @@ namespace framework { HeterPsBase* HeterPsBase::get_instance( size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, int optimizer_type) { - return new HeterPs(capacity, resource, feature_value_accessor, - optimizer_type); + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type) { + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } else { + VLOG(0) << " HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type; + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type) { - comm_ = std::make_shared>( - capacity, resource, feature_value_accessor); - feature_value_accessor_ = feature_value_accessor; +template +HeterPs::HeterPs( + size_t capacity, std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); + feature_value_accessor_.Configure(fleet_config); + set_accessor(feature_value_accessor_); + accessor_type_ = accessor_type; optimizer_type_ = optimizer_type; } -HeterPs::~HeterPs() {} +template +HeterPs::~HeterPs() {} -void HeterPs::pull_sparse(int num, FeatureKey* d_keys, float* d_vals, - size_t len) { +template +void HeterPs::pull_sparse(int num, FeatureKey* d_keys, + float* d_vals, size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } -void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, size_t len, - size_t feature_value_size, size_t chunk_size, - int stream_num) { +template +void HeterPs::build_ps(int num, FeatureKey* h_keys, char* pool, + size_t len, size_t feature_value_size, + size_t chunk_size, int stream_num) { comm_->build_ps(num, h_keys, pool, len, feature_value_size, chunk_size, stream_num); } -int HeterPs::get_index_by_devid(int devid) { +template +int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } -void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_sparse_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_sparse_sgd(optimizer_config); } -void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_embedx_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_embedx_sgd(optimizer_config); } -void HeterPs::end_pass() { comm_->end_pass(); } +template +void HeterPs::end_pass() { + comm_->end_pass(); +} -void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } +template +void HeterPs::show_one_table(int gpu_num) { + 
comm_->show_one_table(gpu_num); +} -void HeterPs::push_sparse(int num, FeatureKey* d_keys, float* d_grads, +template +void HeterPs::push_sparse(int num, FeatureKey* d_keys, float* d_grads, size_t len) { - if (optimizer_type_ == 3) { // adam - auto optimizer = SparseAdamOptimizer(feature_value_accessor_); - VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():" - << optimizer.EmbedDim(); - comm_->push_sparse(num, d_keys, d_grads, len, optimizer); - } else if (optimizer_type_ == 4) { // shared_adam - auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_); - VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():" - << optimizer.EmbedDim(); - comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + if (accessor_type_ == "CtrDymfAccessor") { + if (optimizer_type_ == 3) { // adam + auto optimizer = SparseAdamOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 4) { // shared_adam + auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 1) { // adagrad + auto optimizer = SparseAdagradOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else { + VLOG(0) << " push sparse Error: CtrDymfAccessor only support adagrad(1)," + "adam(3) or shared_adam(4), but get optimizer type:" + << optimizer_type_; + } } else { - auto optimizer = SparseAdagradOptimizer(feature_value_accessor_); - VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():" - << optimizer.EmbedDim(); - comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + VLOG(0) << " push sparse Error: now only support CtrDymfAccessor, but get " + << accessor_type_; } } -void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, - int comm_size) { +template +void HeterPs::set_nccl_comm_and_size( + const std::vector& inner_comms, + const std::vector& inter_comms, int comm_size) { comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); } -void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { +template +void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); } -void HeterPs::set_accessor(CommonFeatureValueAccessor& accessor) { +template +void HeterPs::set_accessor(FVAccessor& accessor) { comm_->set_accessor(accessor); } -void HeterPs::show_table_collisions() { comm_->show_table_collisions(); } +template +void HeterPs::show_table_collisions() { + comm_->show_table_collisions(); +} -int HeterPs::dedup_keys_and_fillidx(const int gpu_id, +template +int HeterPs::dedup_keys_and_fillidx(const int gpu_id, const int total_fea_num, const FeatureKey* d_keys, // input FeatureKey* d_merged_keys, // output diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index caa069fb30613..a8314e2e28e22 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -26,12 +26,13 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +template class HeterPs : public HeterPsBase { public: HeterPs() {} HeterPs(size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, - int optimizer_type); + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type); virtual ~HeterPs(); HeterPs(const HeterPs&) = delete; HeterPs& operator=(const HeterPs&) = delete; @@ -48,7 +49,8 @@ class HeterPs : public HeterPsBase { const std::vector& inter_comms, int comm_size) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; - void set_accessor(CommonFeatureValueAccessor& accessor) override; + + void set_accessor(FVAccessor& accessor); #endif void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; @@ -73,9 +75,10 @@ class HeterPs : public HeterPsBase { bool filter_zero); #endif private: - std::shared_ptr> comm_; + std::shared_ptr> comm_; #if defined(PADDLE_WITH_CUDA) - CommonFeatureValueAccessor feature_value_accessor_; + FVAccessor feature_value_accessor_; + std::string accessor_type_; int optimizer_type_; #endif }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index e5fe095f9b011..840ad8f001b74 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -43,7 +43,6 @@ class HeterPsBase { const std::vector& inner_comms, const std::vector& inter_comms, int comm_size) = 0; virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; - virtual void set_accessor(CommonFeatureValueAccessor& accessor) = 0; #endif virtual void end_pass() = 0; @@ -57,7 +56,9 @@ class HeterPsBase { static HeterPsBase* get_instance( size_t capacity, std::shared_ptr resource, - CommonFeatureValueAccessor feature_value_accessor, int optimizer_type); + // CommonFeatureValueAccessor feature_value_accessor, + std::unordered_map fleet_config, + std::string accessor_type, int optimizer_type); #if defined(PADDLE_WITH_CUDA) // dedup virtual int dedup_keys_and_fillidx(const int gpu_id, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 5fc21d2b0e495..4d64375f93656 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -28,12 +28,12 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include #include #include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_utils.h" #include "paddle/fluid/platform/timer.h" #if defined(PADDLE_WITH_PSCORE) #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" @@ -45,10 +45,12 @@ namespace paddle { namespace framework { #ifdef PADDLE_WITH_PSLIB -void AfsWrapper::init(const std::string& fs_name, const std::string& fs_user, - const std::string& pass_wd, const std::string& conf) { - int ret = afs_handler_.init(fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), - conf.c_str()); +void AfsWrapper::init(const std::string& fs_name, + const std::string& fs_user, + const std::string& pass_wd, + const std::string& conf) { + int ret = afs_handler_.init( + fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), conf.c_str()); if (ret != 0) { LOG(ERROR) << "AFS Init Error"; } @@ -100,8 +102,8 @@ void PSGPUWrapper::InitAfsApi(const std::string& fs_name, const std::string& fs_user, const std::string& pass_wd, const std::string& conf) { - int ret = afs_handler_.init(fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), - conf.c_str()); + int ret = afs_handler_.init( + fs_name.c_str(), fs_user.c_str(), pass_wd.c_str(), conf.c_str()); if (ret != 0) { VLOG(0) << "AFS Init Error"; } @@ -148,16 +150,20 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(0) << "total len: " << total_len; auto gen_dynamic_mf_func = [this]( const std::deque& total_data, - int begin_index, int end_index, int i) { + int begin_index, + int end_index, + int i) { for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { + iter != total_data.begin() + end_index; + iter++) { const auto& ins = *iter; const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; const auto& slot_offset = ins->slot_uint64_feasigns_.slot_offsets; for (size_t slot_idx = 0; slot_idx < slot_offset_vector_.size(); slot_idx++) { for (size_t j = slot_offset[slot_offset_vector_[slot_idx]]; - j < slot_offset[slot_offset_vector_[slot_idx] + 1]; j++) { + j < slot_offset[slot_offset_vector_[slot_idx] + 1]; + j++) { int shard_id = feasign_v[j] % thread_keys_shard_num_; int dim_id = slot_index_vec_[slot_idx]; if (feasign_v[j] != 0) { @@ -170,8 +176,11 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { }; for (int i = 0; i < thread_keys_thread_num_; i++) { threads.push_back( - std::thread(gen_dynamic_mf_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); + std::thread(gen_dynamic_mf_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); begin += len_per_thread + (i < remain ? 
1 : 0); } @@ -192,9 +201,12 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { len_per_thread = total_len / thread_keys_thread_num_; remain = total_len % thread_keys_thread_num_; auto gen_func = [this](const std::deque& total_data, - int begin_index, int end_index, int i) { + int begin_index, + int end_index, + int i) { for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { + iter != total_data.begin() + end_index; + iter++) { const auto& ins = *iter; const auto& feasign_v = ins.uint64_feasigns_; for (const auto feasign : feasign_v) { @@ -206,8 +218,11 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { }; for (int i = 0; i < thread_keys_thread_num_; i++) { threads.push_back( - std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); + std::thread(gen_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); begin += len_per_thread + (i < remain ? 1 : 0); } for (std::thread& t : threads) { @@ -227,19 +242,25 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(0) << "GpuGraphTotalKeys: " << total_len; remain = total_len % thread_keys_thread_num_; auto gen_graph_data_func = [this](const std::vector& total_data, - int begin_index, int end_index, int i) { + int begin_index, + int end_index, + int i) { for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { + iter != total_data.begin() + end_index; + iter++) { uint64_t cur_key = *iter; int shard_id = cur_key % thread_keys_shard_num_; this->thread_keys_[i][shard_id].insert(cur_key); } }; auto gen_graph_dynamic_mf_func = - [this](const std::vector& total_data, int begin_index, - int end_index, int i) { + [this](const std::vector& total_data, + int begin_index, + int end_index, + int i) { for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { + iter != total_data.begin() + end_index; + iter++) { uint64_t cur_key = *iter; int shard_id = cur_key % thread_keys_shard_num_; // TODO: feasign <-> slot <-> multi_dim @@ -250,13 +271,19 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { if (!multi_mf_dim_) { VLOG(1) << "psgpu graph wrapper genfunc"; threads.push_back( - std::thread(gen_graph_data_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); + std::thread(gen_graph_data_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); } else { VLOG(1) << "psgpu graph wrapper genfunc with dynamic mf"; threads.push_back( - std::thread(gen_graph_dynamic_mf_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), i)); + std::thread(gen_graph_dynamic_mf_func, + std::ref(vec_data), + begin, + begin + len_per_thread + (i < remain ? 1 : 0), + i)); } begin += len_per_thread + (i < remain ? 
1 : 0); } @@ -271,8 +298,8 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { // merge thread_keys to shard_keys auto merge_ins_dynamic_mf_func = [this, gpu_task](int shard_num, int dim_id) { for (int i = 0; i < thread_keys_thread_num_; ++i) { - gpu_task->batch_add_keys(shard_num, dim_id, - thread_dim_keys_[i][shard_num][dim_id]); + gpu_task->batch_add_keys( + shard_num, dim_id, thread_dim_keys_[i][shard_num][dim_id]); thread_dim_keys_[i][shard_num][dim_id].clear(); } }; @@ -349,82 +376,87 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { timeline.Start(); - auto ptl_dynamic_mf_func = [this, &local_dim_keys, &local_dim_ptr, - &fleet_ptr](int i, int j) { - size_t key_size = local_dim_keys[i][j].size(); - int32_t status = -1; - int32_t cnt = 0; + auto ptl_dynamic_mf_func = + [this, &local_dim_keys, &local_dim_ptr, &fleet_ptr](int i, int j) { + size_t key_size = local_dim_keys[i][j].size(); + int32_t status = -1; + int32_t cnt = 0; #ifdef PADDLE_WITH_PSLIB - while (true) { - auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - i, reinterpret_cast(local_dim_ptr[i][j].data()), - this->table_id_, local_dim_keys[i][j].data(), key_size); - bool flag = true; - - tt.wait(); - - try { - status = tt.get(); - } catch (const std::future_error& e) { - VLOG(0) << "Caught a future_error with code" << e.code() - << ", Message:" << e.what(); - } - if (status != 0) { - VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; - sleep(sleep_seconds_before_fail_exit_); - flag = false; - cnt++; - } - if (cnt > 3) { - VLOG(0) << "fleet pull sparse failed, retry 3 times"; - exit(-1); - } + while (true) { + auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( + i, + reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, + local_dim_keys[i][j].data(), + key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } - if (flag) { - break; - } - } + if (flag) { + break; + } + } #endif #ifdef PADDLE_WITH_PSCORE - while (true) { - auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( - reinterpret_cast(local_dim_ptr[i][j].data()), this->table_id_, - local_dim_keys[i][j].data(), key_size); - bool flag = true; - - tt.wait(); - - try { - status = tt.get(); - } catch (const std::future_error& e) { - VLOG(0) << "Caught a future_error with code" << e.code() - << ", Message:" << e.what(); - } - if (status != 0) { - VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; - sleep(sleep_seconds_before_fail_exit_); - flag = false; - cnt++; - } - if (cnt > 3) { - VLOG(0) << "fleet pull sparse failed, retry 3 times"; - exit(-1); - } + while (true) { + auto tt = fleet_ptr->worker_ptr_->PullSparsePtr( + reinterpret_cast(local_dim_ptr[i][j].data()), + this->table_id_, + local_dim_keys[i][j].data(), + key_size); + bool flag = true; + + tt.wait(); + + try { + status = tt.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); + } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + 
VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } - if (flag) { - break; - } - } + if (flag) { + break; + } + } #endif - if (status != 0) { - LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; - sleep(300); - exit(-1); - } else { - VLOG(0) << "FleetWrapper Pull sparse to local done with table size: " - << local_dim_keys[i][j].size(); - } - }; + if (status != 0) { + LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; + sleep(300); + exit(-1); + } else { + VLOG(0) << "FleetWrapper Pull sparse to local done with table size: " + << local_dim_keys[i][j].size(); + } + }; threads.resize(thread_keys_shard_num_ * multi_mf_dim_); for (int i = 0; i < thread_keys_shard_num_; i++) { @@ -455,8 +487,11 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { bool record_status = false; auto& device_task_keys = gpu_task->device_task_keys_; auto& device_task_ptrs = gpu_task->device_task_ptr_; - auto build_pull_dynamic_mf_func = [this, device_num, &local_dim_keys, - &local_dim_ptr, &device_dim_keys, + auto build_pull_dynamic_mf_func = [this, + device_num, + &local_dim_keys, + &local_dim_ptr, + &device_dim_keys, &device_dim_ptr, &device_dim_mutex](int i, int j) { std::vector> task_keys(device_num); @@ -488,8 +523,13 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { device_dim_mutex[dev][j]->unlock(); } }; - auto build_func = [device_num, record_status, &pass_values, &local_keys, - &local_ptr, &device_task_keys, &device_task_ptrs](int i) { + auto build_func = [device_num, + record_status, + &pass_values, + &local_keys, + &local_ptr, + &device_task_keys, + &device_task_ptrs](int i) { auto& task_keys = device_task_keys[i]; #ifdef PADDLE_WITH_PSLIB auto& task_ptrs = device_task_ptrs[i]; @@ -584,163 +624,123 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { return; } std::vector threads(device_num); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); HeterPs_ = HeterPsBase::get_instance( - size_max, resource_, feature_value_accessor_, optimizer_type_); + size_max, resource_, fleet_config_, accessor_class_, optimizer_type_); #ifdef PADDLE_WITH_CUDA HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); HeterPs_->set_sparse_sgd(optimizer_config_); HeterPs_->set_embedx_sgd(optimizer_config_); #endif - auto build_dynamic_mf_func = - [this, &gpu_task](int i, int j) { - this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); - // this->HeterPs_->set_accessor(feature_value_accessor_); - int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim - << " feature_value_dim:" - << feature_value_accessor_.common_feature_value.Dim(mf_dim) - << " feature_value_size:" - << feature_value_accessor_.common_feature_value.Size(mf_dim); - size_t feature_value_size = TYPEALIGN( - 8, feature_value_accessor_.common_feature_value.Size(mf_dim)); - auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; - auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; - size_t len = device_dim_keys.size(); - CHECK(len == device_dim_ptrs.size()); - this->mem_pools_[i * this->multi_mf_dim_ + j] = - new MemoryPool(len, feature_value_size); - auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; - for (size_t k = 0; k < len; k++) { - float* val = (float*)(mem_pool->mem_address(k)); - float* ptr_val = device_dim_ptrs[k]->data(); - size_t dim = device_dim_ptrs[k]->size(); + auto build_dynamic_mf_func = [this, &gpu_task, &accessor_wrapper_ptr](int i, + int 
j) { + this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); + // this->HeterPs_->set_accessor(feature_value_accessor_); + int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim + << " feature_value_size:" + << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; + auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; + size_t len = device_dim_keys.size(); + CHECK(len == device_dim_ptrs.size()); + this->mem_pools_[i * this->multi_mf_dim_ + j] = + new MemoryPool(len, feature_value_size); + auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; + #ifdef PADDLE_WITH_PSLIB - val->delta_score = - ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::delta_score_index()]; - val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::show_index()]; - val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::click_index()]; - val->slot = - int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::slot_index()]); - val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_w_index()]; - val->lr_g2sum = - ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_g2sum_index()]; - // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + for (size_t k = 0; k < len; k++) { + float* val = (float*)(mem_pool->mem_address(k)); + float* ptr_val = device_dim_ptrs[k]->data(); + size_t dim = device_dim_ptrs[k]->size(); + val->delta_score = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::mf_dim_index()] = - float(mf_dim); - val->mf_dim = mf_dim; - if (dim > 8) { // CpuPS alreay expand as mf_dim - val->mf_size = mf_dim + 1; - for (int x = 0; x < val->mf_dim + 1; x++) { - val->mf[x] = ptr_val[x + 8]; - } - } else { - val->mf_size = 0; - for (int x = 0; x < val->mf_dim + 1; x++) { - val->mf[x] = 0; - } - } - } -#endif -#ifdef PADDLE_WITH_PSCORE - VLOG(5) << "cpu build " << k - << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) - << " |: " << cpu_table_accessor_->ParseToString(ptr_val, dim); - val[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value - .DeltaScoreIndex()]; - val[feature_value_accessor_.common_feature_value.ShowIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.ShowIndex()]; - val[feature_value_accessor_.common_feature_value.ClickIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.ClickIndex()]; - val[feature_value_accessor_.common_feature_value.SlotIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.SlotIndex()]; - val[feature_value_accessor_.common_feature_value.EmbedWIndex()] = - ptr_val[cpu_table_accessor_->common_feature_value.EmbedWIndex()]; - for (int i = 0; - i < feature_value_accessor_.common_feature_value.EmbedDim(); i++) { - val[feature_value_accessor_.common_feature_value.EmbedG2SumIndex() + - i] = ptr_val - [cpu_table_accessor_->common_feature_value.EmbedG2SumIndex() + i]; + DownpourCtrDymfFeatureValue::delta_score_index()]; + val->show = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::show_index()]; + val->clk = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::click_index()]; + val->slot = 
int(ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::slot_index()]); + val->lr = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_w_index()]; + val->lr_g2sum = + ptr_val[paddle::ps::DownpourCtrDymfAccessor:: + DownpourCtrDymfFeatureValue::embed_g2sum_index()]; + // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor + ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: + mf_dim_index()] = float(mf_dim); + val->mf_dim = mf_dim; + if (dim > 8) { // CpuPS already expanded as mf_dim + val->mf_size = mf_dim + 1; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = ptr_val[x + 8]; } - - *(reinterpret_cast( - val + feature_value_accessor_.common_feature_value.CpuPtrIndex())) = - (uint64_t)(device_dim_ptrs[k]); - - ptr_val[cpu_table_accessor_->common_feature_value.MfDimIndex()] = - float(mf_dim); - val[feature_value_accessor_.common_feature_value.MfDimIndex()] = mf_dim; - if (dim > cpu_table_accessor_->GetAccessorInfo().dim - - cpu_table_accessor_->GetAccessorInfo().mf_size / - sizeof(float)) { - val[feature_value_accessor_.common_feature_value.MfSizeIndex()] = - feature_value_accessor_.common_feature_value.MFSize(mf_dim) / - sizeof(float); - - for (int x = 0; - x < - int(feature_value_accessor_.common_feature_value.MFSize(mf_dim) / - sizeof(float)); - x++) { - val[feature_value_accessor_.common_feature_value - .EmbedxG2SumIndex() + - x] = ptr_val[cpu_table_accessor_->common_feature_value - .EmbedxG2SumIndex() + - x]; - } - } else { - val[feature_value_accessor_.common_feature_value.MfSizeIndex()] = 0; - for (int x = feature_value_accessor_.common_feature_value - .EmbedxG2SumIndex(); - x < - int(feature_value_accessor_.common_feature_value.Size(mf_dim) / - sizeof(float)); - x++) { - val[x] = 0; - } + } else { + val->mf_size = 0; + for (int x = 0; x < val->mf_dim + 1; x++) { + val->mf[x] = 0; } - VLOG(5) << "build " << k << " : " - << feature_value_accessor_.ParseToString( - val, feature_value_accessor_.common_feature_value.Dim( - mf_dim)); } + } +#endif +#ifdef PADDLE_WITH_PSCORE + for (size_t k = 0; k < len; k++) { + void* val = mem_pool->mem_address(k); + // float* ptr_val = device_dim_ptrs[k]->data(); + // size_t dim = device_dim_ptrs[k]->size(); + // VLOG(5) << "cpu build " << k + // << " cpuptr: " << (uint64_t)(device_dim_ptrs[k]) + // << " |: " << cpu_table_accessor_->ParseToString(ptr_val, + // dim); + accessor_wrapper_ptr->BuildFill( + val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim); + VLOG(5) << "build " << k << " : " + << accessor_wrapper_ptr->ParseToString( + (float*)(val), + int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / + sizeof(float))); + } #endif - platform::CUDADeviceGuard guard(resource_->dev_id(i)); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); - this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); - auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + this->hbm_pools_[i * this->multi_mf_dim_ + j] = new HBMMemoryPool(mem_pool); + auto& cur_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - this->HeterPs_->build_ps(i, device_dim_keys.data(), cur_pool->mem(), len, - feature_value_size, 500000, 2); + this->HeterPs_->build_ps(i, + device_dim_keys.data(), + cur_pool->mem(), + len, + feature_value_size, + 500000, + 2); - if (device_dim_keys.size() > 0) { - VLOG(0) << "show ptr table: " << i - << " table kv size: " << device_dim_keys.size() << "dim: " << mf_dim - << " len: " << len; - this->HeterPs_->show_one_table(i); - 
} - delete mem_pool; -}; -threads.resize(device_num* multi_mf_dim_); -for (int i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(build_dynamic_mf_func, i, j); + if (device_dim_keys.size() > 0) { + VLOG(0) << "show ptr table: " << i + << " table kv size: " << device_dim_keys.size() + << "dim: " << mf_dim << " len: " << len; + this->HeterPs_->show_one_table(i); + } + delete mem_pool; + }; + + threads.resize(device_num * multi_mf_dim_); + for (int i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(build_dynamic_mf_func, i, j); + } } -} -for (std::thread& t : threads) { - t.join(); -} -timeline.Pause(); -VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec() + << " s."; } void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { @@ -833,7 +833,8 @@ void PSGPUWrapper::BeginPass() { } if (FLAGS_gpugraph_dedup_pull_push_mode) { VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() - << "s, enable pull push dedup mode=" << FLAGS_gpugraph_dedup_pull_push_mode; + << "s, enable pull push dedup mode=" + << FLAGS_gpugraph_dedup_pull_push_mode; } else { VLOG(0) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; } @@ -856,152 +857,79 @@ void PSGPUWrapper::EndPass() { } } - auto dump_pool_to_cpu_func = - [this](int i, int j) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); - auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; - auto& device_keys = this->current_task_->device_dim_keys_[i][j]; - size_t len = device_keys.size(); - int mf_dim = this->index_dim_vec_[j]; - size_t feature_value_size = TYPEALIGN( - 8, feature_value_accessor_.common_feature_value.Size(mf_dim)); - VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim - << " key_len :" << len - << " feature_value_size:" << feature_value_size; - - char* test_build_values = (char*)malloc(feature_value_size * len); - cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, - cudaMemcpyDeviceToHost); - - CHECK(len == hbm_pool->capacity()); - uint64_t unuse_key = std::numeric_limits::max(); - for (size_t index = 0; index < len; ++index) { - if (device_keys[index] == unuse_key) { - continue; - } - size_t offset = index * feature_value_size; - float* gpu_val = (float*)(test_build_values + offset); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + auto dump_pool_to_cpu_func = [this, &accessor_wrapper_ptr](int i, int j) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); + auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; + auto& device_keys = this->current_task_->device_dim_keys_[i][j]; + size_t len = device_keys.size(); + int mf_dim = this->index_dim_vec_[j]; + size_t feature_value_size = + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim + << " key_len :" << len + << " feature_value_size:" << feature_value_size; + + char* test_build_values = (char*)malloc(feature_value_size * len); + cudaMemcpy(test_build_values, + hbm_pool->mem(), + feature_value_size * len, + cudaMemcpyDeviceToHost); + + CHECK(len == hbm_pool->capacity()); + uint64_t unuse_key = std::numeric_limits::max(); + for (size_t index = 0; index < len; 
++index) { + if (device_keys[index] == unuse_key) { + continue; + } + size_t offset = index * feature_value_size; + float* gpu_val = (float*)(test_build_values + offset); #ifdef PADDLE_WITH_PSLIB - auto* downpour_value = - (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val->mf_size > 0 && downpour_value_size == 8) { - downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); - } - float* cpu_val = downpour_value->data(); - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::delta_score_index()] = - gpu_val->delta_score; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::show_index()] = - gpu_val->show; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::click_index()] = - gpu_val->clk; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_w_index()] = - gpu_val->lr; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::embed_g2sum_index()] = - gpu_val->lr_g2sum; - cpu_val[paddle::ps::DownpourCtrDymfAccessor:: - DownpourCtrDymfFeatureValue::slot_index()] = - gpu_val->slot; - - if (gpu_val->mf_size > 0) { - for (int x = 0; x < gpu_val->mf_dim + 1; x++) { - cpu_val[x + 8] = gpu_val->mf[x]; - } - } - } + // TODO: pslib DumpFill #endif #ifdef PADDLE_WITH_PSCORE - auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*( - reinterpret_cast( - gpu_val + - feature_value_accessor_.common_feature_value.CpuPtrIndex()))); - size_t downpour_value_size = downpour_value->size(); - if (gpu_val[feature_value_accessor_.common_feature_value - .MfSizeIndex()] > 0 && - downpour_value_size == - (cpu_table_accessor_->GetAccessorInfo().dim - - int(cpu_table_accessor_->GetAccessorInfo().mf_size / - sizeof(float)))) { // cpu_accessor - downpour_value->resize( - cpu_table_accessor_->common_feature_value.Dim(mf_dim)); - } - float* cpu_val = downpour_value->data(); - - cpu_val[cpu_table_accessor_->common_feature_value.DeltaScoreIndex()] = - gpu_val[feature_value_accessor_.common_feature_value - .DeltaScoreIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.ShowIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.ShowIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.ClickIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.ClickIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.EmbedWIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.EmbedWIndex()]; - cpu_val[cpu_table_accessor_->common_feature_value.SlotIndex()] = - gpu_val[feature_value_accessor_.common_feature_value.SlotIndex()]; - - for (int i = 0; - i < feature_value_accessor_.common_feature_value.EmbedDim(); i++) { - cpu_val[cpu_table_accessor_->common_feature_value.EmbedG2SumIndex() + - i] = gpu_val[feature_value_accessor_.common_feature_value - .EmbedG2SumIndex() + - i]; - } + accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); + // auto* downpour_value = (paddle::distributed::FixedFeatureValue*)(*( + // reinterpret_cast(gpu_val))); + // float* cpu_val = downpour_value->data(); + // VLOG(5) << "dump to cpu " << index << " gpu_value: " + // << accessor_wrapper_ptr->ParseToString(gpu_val, + // int(accessor_wrapper_ptr->GetFeatureValueSize(mf_dim) / + // sizeof(float))) + // << " \t cpu_value:" + // << cpu_table_accessor_->ParseToString(cpu_val, + // downpour_value->size()); + } +#endif + free(test_build_values); + }; - if 
(gpu_val[feature_value_accessor_.common_feature_value - .MfSizeIndex()] > 0) { - for (int x = 0; - x < - int(feature_value_accessor_.common_feature_value.MFSize(mf_dim) / - sizeof(float)); - x++) { - cpu_val[cpu_table_accessor_->common_feature_value - .EmbedxG2SumIndex() + - x] = gpu_val[feature_value_accessor_.common_feature_value - .EmbedxG2SumIndex() + - x]; - } - } - VLOG(5) << "dump to cpu " << index << " : " - << feature_value_accessor_.ParseToString( - gpu_val, - feature_value_accessor_.common_feature_value.Dim(mf_dim)) - << " ===== CPU:" - << cpu_table_accessor_->ParseToString(cpu_val, - downpour_value->size()); + if (multi_mf_dim_) { + VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; + size_t device_num = heter_devices_.size(); + std::vector threads(device_num * multi_mf_dim_); + for (size_t i = 0; i < device_num; i++) { + for (int j = 0; j < multi_mf_dim_; j++) { + threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); } -#endif - free(test_build_values); -}; -if (multi_mf_dim_) { - VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; - size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_); - for (size_t i = 0; i < device_num; i++) { - for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); + } + for (std::thread& t : threads) { + t.join(); } } - for (std::thread& t : threads) { - t.join(); + if (keysize_max != 0) { + HeterPs_->end_pass(); } -} -if (keysize_max != 0) { - HeterPs_->end_pass(); -} -VLOG(0) << "HeterPs_->end_pass end"; -for (size_t i = 0; i < hbm_pools_.size(); i++) { - delete hbm_pools_[i]; -} -gpu_task_pool_.Push(current_task_); -current_task_ = nullptr; -gpu_free_channel_->Put(current_task_); -timer.Pause(); -VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(0) << "HeterPs_->end_pass end"; + for (size_t i = 0; i < hbm_pools_.size(); i++) { + delete hbm_pools_[i]; + } + gpu_task_pool_.Push(current_task_); + current_task_ = nullptr; + gpu_free_channel_->Put(current_task_); + timer.Pause(); + VLOG(0) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, @@ -1026,7 +954,10 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, platform::Timer pull_gpups_timer; all_timer.Start(); - size_t feature_value_size = feature_value_accessor_.common_pull_value.Size(max_mf_dim_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t feature_value_size = + accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); VLOG(3) << "PullSparse max_dim:" << max_mf_dim_ << " pull_feature_value_size:" << pull_type_size_; @@ -1067,25 +998,40 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, int64_t* slot_lens = dev.slot_lens.mutable_data( (slot_num + 1) * sizeof(int64_t), place); - cudaMemcpyAsync(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), - cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(slot_lens, slot_lengths_lod.data(), + cudaMemcpyAsync(gpu_keys, + keys.data(), + keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(slot_lens, + slot_lengths_lod.data(), slot_lengths_lod.size() * sizeof(int64_t), - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, + stream); - cudaMemcpyAsync(gpu_slot_dims, slot_dim.data(), - slot_dim.size() * sizeof(int), cudaMemcpyHostToDevice, + cudaMemcpyAsync(gpu_slot_dims, + 
slot_dim.data(), + slot_dim.size() * sizeof(int), + cudaMemcpyHostToDevice, stream); float** gpu_values = dev.values_ptr_tensor.mutable_data( values.size() * sizeof(float*), place); - cudaMemcpyAsync(gpu_values, values.data(), values.size() * sizeof(float*), - cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(gpu_values, + values.data(), + values.size() * sizeof(float*), + cudaMemcpyHostToDevice, + stream); int* key2slot = dev.keys2slot.mutable_data( (total_length * 5) * sizeof(int), place); - this->CopyKeys(place, gpu_keys, total_keys, slot_lens, slot_num, - static_cast(total_length), key2slot); + this->CopyKeys(place, + gpu_keys, + total_keys, + slot_lens, + slot_num, + static_cast(total_length), + key2slot); uint32_t* d_restore_idx = reinterpret_cast(&key2slot[total_length]); @@ -1101,7 +1047,8 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, reinterpret_cast(&d_merged_keys[total_length]); int dedup_size = HeterPs_->dedup_keys_and_fillidx( - devid_2_index, static_cast(total_length), + devid_2_index, + static_cast(total_length), total_keys, // input d_merged_keys, // output d_sorted_keys, // sort keys @@ -1110,11 +1057,13 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, d_offset, // offset d_merged_cnts, FLAGS_gpugraph_dedup_pull_push_mode & 0x02); -// printf("device %d, end dedup_keys_and_fillidx total %d, " -// "dedup_size %d, slot num: %d, value size: %d\n", -// device_id, int(total_length), dedup_size, slot_num, int(feature_value_size)); + // printf("device %d, end dedup_keys_and_fillidx total %d, " + // "dedup_size %d, slot num: %d, value size: %d\n", + // device_id, int(total_length), dedup_size, slot_num, + // int(feature_value_size)); - PADDLE_ENFORCE_GT(dedup_size, 0, + PADDLE_ENFORCE_GT(dedup_size, + 0, platform::errors::PreconditionNotMet( "dedup keys need more than zero failed in BoxPS.")); dev.dedup_key_length = dedup_size; @@ -1123,13 +1072,21 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, float* total_values_gpu = dev.pull_push_tensor.mutable_data(total_bytes, place); pull_gpups_timer.Start(); - HeterPs_->pull_sparse(devid_2_index, d_merged_keys, total_values_gpu, - dedup_size); + HeterPs_->pull_sparse( + devid_2_index, d_merged_keys, total_values_gpu, dedup_size); // values.size() not sure equal slot_num - this->CopyForPull(place, total_keys, gpu_values, total_values_gpu, - slot_lens, key2slot, max_mf_dim_ + 3, total_length, - gpu_slot_dims, d_restore_idx); + accessor_wrapper_ptr->CopyForPull(place, + total_keys, + gpu_values, + total_values_gpu, + slot_lens, + key2slot, + max_mf_dim_ + 3, + total_length, + gpu_slot_dims, + d_restore_idx, + feature_value_size); } else { size_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); @@ -1150,32 +1107,48 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), + cudaMemcpy(gpu_keys, + keys.data(), + keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); auto buf_dim = memory::Alloc(place, slot_dim.size() * sizeof(int)); int* gpu_dim = 
reinterpret_cast(buf_dim->ptr()); - cudaMemcpy(gpu_dim, slot_dim.data(), slot_dim.size() * sizeof(int), + cudaMemcpy(gpu_dim, + slot_dim.data(), + slot_dim.size() * sizeof(int), cudaMemcpyHostToDevice); - this->CopyKeys(place, gpu_keys, total_keys, gpu_len, + this->CopyKeys(place, + gpu_keys, + total_keys, + gpu_len, static_cast(slot_lengths.size()), static_cast(total_length)); VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; pull_gpups_timer.Start(); - HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, - total_length); + HeterPs_->pull_sparse( + devid_2_index, total_keys, total_values_gpu, total_length); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, - static_cast(slot_lengths.size()), hidden_size, - total_length, gpu_dim); + accessor_wrapper_ptr->CopyForPull(place, + gpu_keys, + values, + total_values_gpu, + gpu_len, + static_cast(slot_lengths.size()), + hidden_size, + total_length, + gpu_dim, + feature_value_size); } pull_gpups_timer.Pause(); @@ -1207,28 +1180,41 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); uint64_t** xpu_keys = reinterpret_cast(buf_key->ptr()); int64_t* xpu_len = reinterpret_cast(buf_length->ptr()); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_keys, keys.data(), + PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_keys, + keys.data(), keys.size() * sizeof(uint64_t*), XPU_HOST_TO_DEVICE)); - PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_len, slot_lengths_lod.data(), + PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_len, + slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE)); - this->CopyKeys(place, xpu_keys, total_keys, xpu_len, + this->CopyKeys(place, + xpu_keys, + total_keys, + xpu_len, static_cast(slot_lengths.size()), static_cast(total_length)); VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; pull_gpups_timer.Start(); - HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, + HeterPs_->pull_sparse(devid_2_index, + total_keys, + total_values_gpu, static_cast(total_length)); pull_gpups_timer.Pause(); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, xpu_keys, values, total_values_gpu, xpu_len, - static_cast(slot_lengths.size()), hidden_size, - total_length); + accessor_wrapper_ptr->CopyForPull(place, + xpu_keys, + values, + total_values_gpu, + xpu_len, + static_cast(slot_lengths.size()), + hidden_size, + total_length, + feature_value_size); #endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -1246,12 +1232,14 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, const std::vector& keys, const std::vector& grad_values, const std::vector& slot_lengths, - const int hidden_size, const int batch_size) { + const int hidden_size, + const int batch_size) { platform::Timer all_timer; platform::Timer push_gpups_timer; all_timer.Start(); - size_t grad_value_size = - TYPEALIGN(8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( @@ -1275,17 +1263,22 @@ void PSGPUWrapper::PushSparseGrad(const 
paddle::platform::Place& place, if (!dev.d_slot_vector.IsInitialized()) { int* buf_slot_vector = dev.d_slot_vector.mutable_data(slot_num * sizeof(int), place); - cudaMemcpyAsync(buf_slot_vector, slot_vector_.data(), - slot_num * sizeof(int), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(buf_slot_vector, + slot_vector_.data(), + slot_num * sizeof(int), + cudaMemcpyHostToDevice, + stream); } const int64_t* slot_lens = dev.slot_lens.data(); const int* d_slot_vector = dev.d_slot_vector.data(); const int* key2slot = dev.keys2slot.data(); float** gpu_values = dev.values_ptr_tensor.data(); - cudaMemcpyAsync(gpu_values, grad_values.data(), + cudaMemcpyAsync(gpu_values, + grad_values.data(), grad_values.size() * sizeof(float*), - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, + stream); uint64_t* d_merged_keys = &total_keys[total_length]; @@ -1296,26 +1289,50 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, // dedup rate more than 3 if (total_length > dedup_size * 3) { const uint32_t* d_restore_idx = - reinterpret_cast(&key2slot[total_length]); - this->CopyForPush(place, total_keys, gpu_values, total_grad_values_gpu, - d_slot_vector, slot_lens, max_mf_dim_ + 3, total_length, - dedup_size, batch_size, slot_dims, key2slot, - d_restore_idx, grad_value_size); + reinterpret_cast(&key2slot[total_length]); + accessor_wrapper_ptr->CopyForPush(place, + total_keys, + gpu_values, + total_grad_values_gpu, + d_slot_vector, + slot_lens, + max_mf_dim_ + 3, + total_length, + dedup_size, + batch_size, + slot_dims, + key2slot, + d_restore_idx, + grad_value_size); } else { const uint32_t* d_sorted_idx = - reinterpret_cast(&key2slot[total_length * 2]); + reinterpret_cast(&key2slot[total_length * 2]); const uint32_t* d_offset = - reinterpret_cast(&d_sorted_idx[total_length]); + reinterpret_cast(&d_sorted_idx[total_length]); const uint32_t* d_merged_cnts = - reinterpret_cast(&d_offset[total_length]); - this->CopyForPush(place, d_merged_keys, gpu_values, total_grad_values_gpu, - d_slot_vector, slot_lens, max_mf_dim_ + 3, total_length, - dedup_size, batch_size, slot_dims, key2slot, - d_sorted_idx, d_offset, d_merged_cnts, grad_value_size); + reinterpret_cast(&d_offset[total_length]); + accessor_wrapper_ptr->CopyForPush(place, + d_merged_keys, + gpu_values, + total_grad_values_gpu, + d_slot_vector, + slot_lens, + max_mf_dim_ + 3, + total_length, + dedup_size, + batch_size, + slot_dims, + key2slot, + d_sorted_idx, + d_offset, + d_merged_cnts, + grad_value_size); } push_gpups_timer.Start(); - HeterPs_->push_sparse(devid_2_index, d_merged_keys, total_grad_values_gpu, + HeterPs_->push_sparse(devid_2_index, + d_merged_keys, + total_grad_values_gpu, static_cast(dedup_size)); } else { int64_t total_length = @@ -1332,13 +1349,22 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, reinterpret_cast(total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to gpups struct"; - this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, - total_length, batch_size, grad_value_size); + accessor_wrapper_ptr->CopyForPush(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + total_length, + batch_size, + grad_value_size, + slot_vector_, + slot_mf_dim_vector_); VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; push_gpups_timer.Start(); - HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, + HeterPs_->push_sparse(devid_2_index, + total_keys, + total_grad_values_gpu, 
static_cast(total_length)); } push_gpups_timer.Pause(); @@ -1359,13 +1385,21 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to xpups struct"; - this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, - hidden_size, total_length, batch_size); + accessor_wrapper_ptr->CopyForPush(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + hidden_size, + total_length, + batch_size, + slot_vector_); VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index << " len: " << total_length; push_gpups_timer.Start(); - HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, + HeterPs_->push_sparse(devid_2_index, + total_keys, + total_grad_values_gpu, static_cast(total_length)); push_gpups_timer.Pause(); #endif @@ -1382,6 +1416,6 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, VLOG(3) << "End PushSparseGrad"; } -} // end namespace framework +} // namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index d9db24e0d0183..73506538e2358 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -31,88 +31,6 @@ const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS; #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS) #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0 -__global__ void PullCopy(float** dest, const FeatureValue* src, - const int64_t* len, int hidden, int slot_num, - int total_len, uint64_t** keys) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[x - 1] : 0); - if (*(keys[x] + y) == 0) { - *(dest[x] + y * hidden) = 0; - *(dest[x] + y * hidden + 1) = 0; - *(dest[x] + y * hidden + 2) = 0; - } else { - *(dest[x] + y * hidden) = (src + i)->show; - *(dest[x] + y * hidden + 1) = (src + i)->clk; - *(dest[x] + y * hidden + 2) = (src + i)->lr; - } - if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = 0; - } - } else { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j]; - } - } - } -} - -template -__global__ void PullCopy(float** dest, const float* src, const int64_t* len, - int slot_num, int total_len, uint64_t** keys, - uint64_t max_val_size, int* gpu_dim, - TAccess accessor) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? 
len[x - 1] : 0); - float* feature_value_ptr = - (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); - int mf_dim = gpu_dim[x] - 3; - if (*(keys[x] + y) == 0) { - *(dest[x] + y * (mf_dim + 3)) = 0; - *(dest[x] + y * (mf_dim + 3) + 1) = 0; - *(dest[x] + y * (mf_dim + 3) + 2) = 0; - } else { - *(dest[x] + y * (mf_dim + 3)) = - feature_value_ptr[accessor.ShowIndex()]; - *(dest[x] + y * (mf_dim + 3) + 1) = - feature_value_ptr[accessor.ClickIndex()]; - *(dest[x] + y * (mf_dim + 3) + 2) = - feature_value_ptr[accessor.EmbedWIndex()]; - } - - if (feature_value_ptr[accessor.MfSizeIndex()] == 0 || - *(keys[x] + y) == 0) { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0; - } - } else { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = - feature_value_ptr[accessor.EmbedxWIndex() + j]; - } - } - } -} __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, int slot_num, @@ -158,84 +76,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len, } } -__global__ void PushCopyWithPool( - float* dest, float** src, int64_t* len, int slot_num, uint64_t total_len, - int bs, int* slot_vector, int* mf_dim_vector, size_t grad_value_size, - CommonFeatureValueAccessor feature_value_accessor) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[low - 1] : 0); - float* cur = (float*)((char*)dest + i * grad_value_size); - - cur[feature_value_accessor.common_push_value.SlotIndex()] = - (float)slot_vector[x]; - int mf_dim = mf_dim_vector[x]; - cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim; - - cur[feature_value_accessor.common_push_value.ShowIndex()] = - *(src[x] + y * (mf_dim + 3)); - cur[feature_value_accessor.common_push_value.ClickIndex()] = - *(src[x] + y * (mf_dim + 3) + 1); - cur[feature_value_accessor.common_push_value.EmbedGIndex()] = - *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; - for (int j = 0; j < mf_dim; j++) { - cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = - *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; - } - } -} PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const int64_t total_length) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num, - total_length, gpu_keys); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const float* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const int64_t total_length, int* gpu_dim) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, total_values_gpu, gpu_len, slot_num, total_length, gpu_keys, - pull_type_size_, gpu_dim, feature_value_accessor_.common_pull_value); - cudaStreamSynchronize(stream); -} - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, @@ -271,6 +113,7 @@ __global__ void CopyKeysKernel2( dest_total_keys[i] = src_keys[low][y]; } } + void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* slot_lens, int slot_num, @@ -282,320 +125,6 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, total_len, origin_keys, total_keys, slot_num, slot_lens, key2slot); cudaStreamSynchronize(stream); } -template -__global__ void PullDedupCopy( - const size_t N, const uint64_t* total_keys, float** dest, const float* src, - const int64_t* slot_lens, uint64_t max_val_size, const int* slot_dims, - const int hidden, const int* key2slot, const uint32_t* restore_idx, - TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { - int i = idx / hidden; - int off = idx % hidden; - - int x = key2slot[i]; - int y = i - slot_lens[x]; - - assert(slot_dims[x] == hidden); - float* dest_ptr = dest[x] + y * hidden; - // 0 key fill zero - if (total_keys[i] == 0) { - *(dest_ptr + off) = 0; - return; - } - - float* src_ptr = - (float*)((char*)src + - uint64_t(restore_idx[i]) * uint64_t(max_val_size)); - switch (off) { - case 0: - *(dest_ptr + off) = src_ptr[accessor.ShowIndex()]; - break; - case 1: - *(dest_ptr + off) = src_ptr[accessor.ClickIndex()]; - break; - case 2: - *(dest_ptr + off) = src_ptr[accessor.EmbedWIndex()]; - break; - default: - if (src_ptr[accessor.MfSizeIndex()] == 0) { - *(dest_ptr + off) = 0; - } else { - *(dest_ptr + off) = - src_ptr[accessor.EmbedxWIndex() + off - 3]; - } - break; - } - } -} -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - const uint64_t* total_keys, float** gpu_values, - const float* 
total_values_gpu, - const int64_t* slot_lens, const int* key2slot, - const int hidden_size, - const int64_t total_length, - const int* slot_dims, - const uint32_t* gpu_restore_idx) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - size_t N = total_length * hidden_size; - PullDedupCopy<<>>( - N, total_keys, gpu_values, total_values_gpu, slot_lens, pull_type_size_, - slot_dims, hidden_size, key2slot, gpu_restore_idx, - feature_value_accessor_.common_pull_value); - cudaStreamSynchronize(stream); -} -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - - cudaMemcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - - PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, gpu_values, gpu_len, hidden_size, - slot_lengths.size(), total_length, batch_size, d_slot_vector); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - float* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, - const int batch_size, size_t grad_value_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - auto buf_mf_dim_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); - cudaMemcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - cudaMemcpy(d_mf_dim_vector, 
slot_mf_dim_vector_.data(), - slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, gpu_values, gpu_len, slot_lengths.size(), - total_length, batch_size, d_slot_vector, d_mf_dim_vector, grad_value_size, - feature_value_accessor_); - cudaStreamSynchronize(stream); -} - -template -__global__ void PushMergeCopyAtomic( - const size_t N, const uint64_t* total_keys, float* dest, float** src, - const int hidden, const int bs, const int* slot_vector, - const int* slot_dims, const int64_t* slot_lens, const int* key2slot, - const uint32_t* d_restore_idx, size_t grad_value_size, - TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { - int i = idx / hidden; - int off = idx % hidden; - // filter 0 keys - if (total_keys[i] == 0) { - return; - } - - int x = key2slot[i]; - int y = i - slot_lens[x]; - - const float* ptr = src[x] + y * hidden; - float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size); - int mf_dim = slot_dims[x] - 3; - switch (off) { - case 0: - cur[accessor.SlotIndex()] = (float)slot_vector[x]; - cur[accessor.MfDimIndex()] = mf_dim; - paddle::platform::CudaAtomicAdd( - &cur[accessor.ShowIndex()], *(ptr + off)); - break; - case 1: - paddle::platform::CudaAtomicAdd( - &cur[accessor.ClickIndex()], *(ptr + off)); - break; - case 2: - paddle::platform::CudaAtomicAdd( - &cur[accessor.EmbedGIndex()], *(ptr + off) * -1. * bs); - break; - default: - int embedx_idx = off - 3; - if (mf_dim < embedx_idx) { - return; - } - paddle::platform::CudaAtomicAdd( - &cur[accessor.EmbedxGIndex() + embedx_idx], *(ptr + off) * -1. * bs); - break; - } - } -} - -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const uint64_t* total_keys, float** grad_values, - float* total_grad_values_gpu, const int* slots, - const int64_t* slot_lens, const int hidden_size, - const int64_t total_length, - const int64_t dedup_length, const int batch_size, - const int* slot_dims, const int* key2slot, - const uint32_t* d_restore_idx, - const size_t grad_value_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - cudaMemsetAsync(total_grad_values_gpu, 0, dedup_length * grad_value_size, - stream); - size_t N = total_length * hidden_size; - PushMergeCopyAtomic<<>>( - N, total_keys, total_grad_values_gpu, grad_values, hidden_size, - batch_size, slots, slot_dims, slot_lens, key2slot, d_restore_idx, - grad_value_size, feature_value_accessor_.common_push_value); - - cudaStreamSynchronize(stream); -} - -#define SUM_GRAD_VALUE \ - for (uint32_t j = 0; j < count; ++j) { \ - const uint32_t& pos = d_sort_idx[start + j]; \ - const int& x = key2slot[pos]; \ - y = pos - slot_lens[x]; \ - val += *(reinterpret_cast(src[x] + y * hidden + off)); \ - } - -template -__global__ void PushMergeCopy( - const size_t N, const uint64_t* total_keys, float* dest, float** src, - const int hidden, const int bs, const int* slot_vector, - const int* slot_dims, const int64_t* slot_lens, const int* key2slot, - const uint32_t* d_sort_idx, - const uint32_t* d_sort_offset, - const uint32_t* d_sort_cnt, size_t grad_value_size, - TAccess accessor) { - CUDA_KERNEL_LOOP(idx, N) { - int i = idx / hidden; - int off = idx % hidden; - // filter 0 keys - float* cur = (float*)((char*)dest + i * grad_value_size); - - if (total_keys[i] == 0) { - switch (off) { - case 0: - cur[accessor.SlotIndex()] = 0; - cur[accessor.MfDimIndex()] = 0; - cur[accessor.ShowIndex()] = 0.0; - break; - case 1: - 
cur[accessor.ClickIndex()] = 0.0; - break; - case 2: - cur[accessor.EmbedGIndex()] = 0.0; - break; - default: - cur[accessor.EmbedxGIndex() + off - 3] = 0.0; - break; - } - return; - } - - const uint32_t& start = d_sort_offset[i]; - const uint32_t& count = d_sort_cnt[i]; - const uint32_t& pos = d_sort_idx[start]; - - const int& x = key2slot[pos]; - int y = pos - slot_lens[x]; - int mf_dim = slot_dims[x] - 3; - - double val = 0.0; - - switch (off) { - case 0: - cur[accessor.SlotIndex()] = (float)slot_vector[x]; - cur[accessor.MfDimIndex()] = mf_dim; - SUM_GRAD_VALUE - cur[accessor.ShowIndex()] = val; - break; - case 1: - SUM_GRAD_VALUE - cur[accessor.ClickIndex()] = val; - break; - case 2: - SUM_GRAD_VALUE - cur[accessor.EmbedGIndex()] = val * -1. * bs; - break; - default: - int embedx_idx = off - 3; - if (mf_dim < embedx_idx) { - cur[accessor.EmbedxGIndex() + embedx_idx] = 0.0; - return; - } - SUM_GRAD_VALUE - cur[accessor.EmbedxGIndex() + embedx_idx] = val * -1. * bs; - break; - } - } -} - -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const uint64_t* total_keys, float** grad_values, - float* total_grad_values_gpu, const int* slots, - const int64_t* slot_lens, const int hidden_size, - const int64_t total_length, const int64_t dedup_length, - const int batch_size, const int* slot_dims, - const int* key2slot, - const uint32_t* gpu_sort_idx, - const uint32_t* gpu_sort_offset, - const uint32_t* gpu_sort_lens, - const size_t grad_value_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - // merge all grad to one - size_t N = dedup_length * hidden_size; - PushMergeCopy<<>>( - N, total_keys, total_grad_values_gpu, grad_values, hidden_size, - batch_size, slots, slot_dims, slot_lens, key2slot, - gpu_sort_idx, gpu_sort_offset, gpu_sort_lens, - grad_value_size, feature_value_accessor_.common_push_value); - cudaStreamSynchronize(stream); -} void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index babee603b1517..b69cbccd0c1c9 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -65,9 +65,6 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -#define TYPEALIGN(ALIGNVAL, LEN) \ - (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) - class Dataset; #ifdef PADDLE_WITH_PSLIB @@ -174,53 +171,6 @@ class PSGPUWrapper { void CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, int total_len, int* key2slot); - void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, const int64_t* gpu_len, - const int slot_num, const int hidden_size, - const int64_t total_length); - void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, - const std::vector& values, - const float* total_values_gpu, const int64_t* gpu_len, - const int slot_num, const int hidden_size, - const int64_t total_length, int* gpu_dim); - void CopyForPull(const paddle::platform::Place& place, - const uint64_t* total_keys, float** gpu_values, - const float* total_values_gpu, const int64_t* slot_lens, - const int* key2slot, const int hidden_size, - const int64_t total_length, const int* slot_dims, - const uint32_t* gpu_restore_idx); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, const int64_t total_length, - const int batch_size); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - float* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, const int batch_size, - size_t grad_value_size); - void CopyForPush(const paddle::platform::Place& place, - const uint64_t* total_keys, float** grad_values, - float* total_grad_values_gpu, const int* slots, - const int64_t* slot_lens, const int hidden_size, - const int64_t total_length, const int64_t dedup_length, - const int batch_size, const int* slot_dims, - const int* key2slot, const uint32_t* d_restore_idx, - const size_t grad_value_size); - void CopyForPush(const paddle::platform::Place& place, - const uint64_t* total_keys, float** grad_values, - float* total_grad_values_gpu, const int* slots, - const int64_t* slot_lens, const int hidden_size, - const int64_t total_length, const int64_t dedup_length, - const int batch_size, const int* slot_dims, - const int* key2slot, - const uint32_t* gpu_sort_idx, - const uint32_t* gpu_sort_offset, - const uint32_t* gpu_sort_lens, - const size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); @@ -402,7 +352,7 @@ class PSGPUWrapper { auto sparse_table_accessor = sparse_table.accessor(); auto sparse_table_accessor_parameter = sparse_table_accessor.ctr_accessor_param(); - auto accessor_class = sparse_table_accessor.accessor_class(); + accessor_class_ = sparse_table_accessor.accessor_class(); std::unordered_map config; config["embedx_dim"] = sparse_table_accessor.embedx_dim(); @@ -413,14 +363,17 @@ class PSGPUWrapper { config["nodeid_slot"] = sparse_table_accessor.graph_sgd_param().nodeid_slot(); config["feature_learning_rate"] = sparse_table_accessor.graph_sgd_param().feature_learning_rate(); - if (accessor_class == "CtrDymfAccessor") { + if (accessor_class_ == "CtrDymfAccessor") { // optimizer config for embed_w and embedx add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); add_sparse_optimizer(config, sparse_table_accessor.embedx_sgd_param(), "mf_"); } - 
feature_value_accessor_.Configure(config); + fleet_config_ = config; + GlobalAccessorTransfor::GetInstance().Init(accessor_class_); + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper()->Configure( + config); InitializeGPUServer(config); } #endif @@ -505,24 +458,8 @@ class PSGPUWrapper { optimizer_type_ = (config.find("optimizer_type") == config.end()) ? 1 : int(config["optimizer_type"]); - embedx_dim_ = (config.find("embedx_dim") == config.end()) - ? 8 - : int(config["embedx_dim"]); - if (optimizer_type_ == 3) { // adam - embed_sgd_dim_ = 4; - embedx_sgd_dim_ = embedx_dim_ * 2 + 2; - } else if (optimizer_type_ == 4) { // shared_adam - embed_sgd_dim_ = 4; - embedx_sgd_dim_ = 4; - } else { - embed_sgd_dim_ = 1; - embedx_sgd_dim_ = 1; - } - VLOG(0) << "InitializeGPUServer embed_sgd_dim_:" << embed_sgd_dim_ - << " embedx_sgd_dim_:" << embedx_sgd_dim_ - << " embedx_dim_:" << embedx_dim_ - << " optimizer_type_:" << optimizer_type_ + VLOG(0) << "InitializeGPUServer optimizer_type_:" << optimizer_type_ << " nodeid_slot:" << nodeid_slot << " feature_learning_rate:" << feature_learning_rate; } @@ -610,12 +547,13 @@ class PSGPUWrapper { for (size_t i = 0; i < slot_index_vec_.size(); i++) { slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]]; } - val_type_size_ = TYPEALIGN( - 8, feature_value_accessor_.common_feature_value.Size(max_mf_dim_)); - grad_type_size_ = TYPEALIGN( - 8, feature_value_accessor_.common_push_value.Size(max_mf_dim_)); - pull_type_size_ = feature_value_accessor_.common_pull_value.Size(max_mf_dim_); - VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_ + + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + pull_type_size_ = accessor_wrapper_ptr->GetPullValueSize(max_mf_dim_); + VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_ << " grad_type_size_:" << grad_type_size_ << " pull_type_size_:" << pull_type_size_; slot_info_initialized_ = true; @@ -638,13 +576,10 @@ class PSGPUWrapper { #ifdef PADDLE_WITH_PSCORE void SetTableAccessor(paddle::distributed::ValueAccessor* accessor) { - cpu_table_accessor_ = - dynamic_cast<paddle::distributed::CtrDymfAccessor*>(accessor); + cpu_table_accessor_ = accessor; } #endif - CommonFeatureValueAccessor feature_value_accessor_; - private: static std::shared_ptr<PSGPUWrapper> s_instance_; Dataset* dataset_; @@ -699,11 +634,10 @@ class PSGPUWrapper { bool slot_info_initialized_ = false; int use_afs_api_ = 0; int optimizer_type_ = 1; - int embed_sgd_dim_ = 1; - int embedx_sgd_dim_ = 1; - int embedx_dim_ = 8; + std::string accessor_class_; + std::unordered_map<std::string, float> fleet_config_; #ifdef PADDLE_WITH_PSCORE - paddle::distributed::CtrDymfAccessor* cpu_table_accessor_; + paddle::distributed::ValueAccessor* cpu_table_accessor_; #endif #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index f1084dc4d758b..df8ad45bb472d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -169,33 +169,6 @@ __global__ void PushCopy(FeaturePushValue* dest, float** src, long long* len, PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector<float*>& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const
int64_t total_length) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - float* buf_value = nullptr; - xpu_malloc(reinterpret_cast(&buf_value), - values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(&buf_value); - xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), - XPU_HOST_TO_DEVICE); - - unsigned long long** c_keys = (unsigned long long**)gpu_keys; - const long long* c_len = (const long long*)gpu_len; - PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size, - slot_num, total_length, c_keys); - - xpu_wait(stream); -} - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, @@ -213,51 +186,6 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, xpu_wait(stream); } -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - - float* buf_grad_value = nullptr; - int64_t* buf_length = nullptr; - int* buf_slot_vector = nullptr; - - xpu_malloc(reinterpret_cast(&buf_grad_value), - grad_values.size() * sizeof(float*)); - xpu_malloc(reinterpret_cast(&buf_length), - slot_lengths.size() * sizeof(int64_t)); - xpu_malloc(reinterpret_cast(&buf_slot_vector), - slot_lengths_lod.size() * sizeof(int)); - - float** gpu_values = reinterpret_cast(&buf_grad_value); - int64_t* gpu_len = reinterpret_cast(buf_length); - int* d_slot_vector = reinterpret_cast(buf_slot_vector); - xpu_memcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); - xpu_memcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE); - xpu_memcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE); - - long long* c_len = (long long*)gpu_len; - PushCopy<<<2, 64, stream>>>(total_grad_values_gpu, gpu_values, c_len, - hidden_size, slot_lengths.size(), total_length, - batch_size, d_slot_vector); - xpu_wait(stream); -} - } // end namespace framework } // end namespace paddle #endif diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index ffc3f2b21a476..f81ea5f5572c5 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -326,6 +326,13 @@ def test_fleet_desc_configs(self): .accessor.embed_sgd_param.adagrad.initial_range, 0.0001) + strategy = paddle.distributed.fleet.DistributedStrategy() + configs = {} + configs['emb'] = {"sparse_optimizer": "shared_adam"} + strategy.fleet_desc_configs = configs + self.assertEqual(strategy.sparse_table_configs[0] + .accessor.embed_sgd_param.adam.beta1_decay_rate, 0.9) + def test_trainer_desc_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() configs = {