From b50ef420f97a3551f5ed65c3e14a8417caf35d44 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 20 Jul 2022 15:02:37 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90GPUPS=E3=80=91Adam=20accessor=20(#4391?= =?UTF-8?q?9)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add adam/sharedadam optimzier for gpups;edit optimizer struct;test=develop --- .../distributed/ps/table/ctr_dymf_accessor.cc | 28 +- .../distributed/ps/table/ctr_dymf_accessor.h | 19 +- .../distributed/ps/table/sparse_sgd_rule.cc | 84 ++- .../distributed/ps/table/sparse_sgd_rule.h | 23 + paddle/fluid/distributed/ps/table/table.cc | 1 + .../distributed/ps/wrapper/CMakeLists.txt | 1 + paddle/fluid/distributed/ps/wrapper/fleet.cc | 45 +- .../framework/distributed_strategy.proto | 6 +- paddle/fluid/framework/fleet/CMakeLists.txt | 15 +- paddle/fluid/framework/fleet/heter_context.h | 2 - .../framework/fleet/heter_ps/CMakeLists.txt | 8 +- .../framework/fleet/heter_ps/feature_value.cu | 192 +++++ .../framework/fleet/heter_ps/feature_value.h | 705 ++++++++++++++++++ .../fleet/heter_ps/graph_gpu_ps_table.h | 6 +- .../framework/fleet/heter_ps/hashtable.h | 12 +- .../fleet/heter_ps/hashtable_kernel.cu | 162 ++-- .../framework/fleet/heter_ps/heter_comm.h | 27 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 433 ++++++----- .../fleet/heter_ps/heter_comm_kernel.cu | 173 +++-- .../fleet/heter_ps/heter_comm_kernel.h | 52 +- .../framework/fleet/heter_ps/heter_ps.cc | 43 +- .../framework/fleet/heter_ps/heter_ps.cu | 145 ++-- .../fluid/framework/fleet/heter_ps/heter_ps.h | 25 +- .../framework/fleet/heter_ps/heter_ps_base.h | 20 +- .../fluid/framework/fleet/heter_ps/mem_pool.h | 14 - .../framework/fleet/heter_ps/optimizer.cuh.h | 472 ++++++++++-- .../framework/fleet/heter_ps/optimizer_conf.h | 28 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 310 +++----- .../fluid/framework/fleet/ps_gpu_wrapper.cu | 326 +------- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 211 ++++-- .../fluid/framework/fleet/ps_gpu_wrapper.kps | 179 ++--- .../fleet/base/distributed_strategy.py | 15 + python/paddle/distributed/ps/the_one_ps.py | 2 +- .../tests/unittests/test_dist_fleet_ps13.py | 201 +++++ .../test_fleet_distributed_strategy.py | 8 + tools/parallel_UT_rule.py | 3 +- 36 files changed, 2714 insertions(+), 1282 deletions(-) create mode 100644 paddle/fluid/framework/fleet/heter_ps/feature_value.cu create mode 100644 python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc index c65eac99acc03..4feee70fed751 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.cc @@ -31,6 +31,7 @@ int CtrDymfAccessor::Initialize() { _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); _embedx_sgd_rule->LoadConfig(_config.embedx_sgd_param(), _config.embedx_dim()); + common_feature_value.optimizer_name = name; common_feature_value.embed_sgd_dim = _embed_sgd_rule->Dim(); common_feature_value.embedx_dim = _config.embedx_dim(); @@ -42,7 +43,10 @@ int CtrDymfAccessor::Initialize() { if (_config.ctr_accessor_param().show_scale()) { _show_scale = true; } - VLOG(0) << " INTO CtrDymfAccessor::Initialize()"; + VLOG(0) << " INTO CtrDymfAccessor::Initialize(); embed_sgd_dim:" + << common_feature_value.embed_sgd_dim + << " embedx_dim:" << common_feature_value.embedx_dim + << " embedx_sgd_dim:" 
<< common_feature_value.embedx_sgd_dim; InitAccessorInfo(); return 0; } @@ -53,9 +57,9 @@ void CtrDymfAccessor::InitAccessorInfo() { auto embedx_dim = _config.embedx_dim(); VLOG(0) << "InitAccessorInfo embedx_dim:" << embedx_dim; - _accessor_info.select_dim = 3 + embedx_dim; + _accessor_info.select_dim = 4 + embedx_dim; _accessor_info.select_size = _accessor_info.select_dim * sizeof(float); - _accessor_info.update_dim = 4 + embedx_dim; + _accessor_info.update_dim = 5 + embedx_dim; _accessor_info.update_size = _accessor_info.update_dim * sizeof(float); _accessor_info.mf_size = (embedx_dim + common_feature_value.embedx_sgd_dim) * sizeof(float); @@ -179,8 +183,10 @@ int32_t CtrDymfAccessor::Create(float** values, size_t num) { value[common_feature_value.ClickIndex()] = 0; value[common_feature_value.SlotIndex()] = -1; value[common_feature_value.MfDimIndex()] = -1; - _embed_sgd_rule->InitValue(value + common_feature_value.EmbedWIndex(), - value + common_feature_value.EmbedG2SumIndex()); + _embed_sgd_rule->InitValue( + value + common_feature_value.EmbedWIndex(), + value + common_feature_value.EmbedG2SumIndex(), + false); // adam embed init not zero, adagrad embed init zero _embedx_sgd_rule->InitValue(value + common_feature_value.EmbedxWIndex(), value + common_feature_value.EmbedxG2SumIndex(), false); @@ -293,22 +299,14 @@ std::string CtrDymfAccessor::ParseToString(const float* v, int param) { i++) { os << " " << v[i]; } - // os << " " << common_feature_value.Slot(const_cast(v)) << " " - // << common_feature_value.MfDim(const_cast(v)); auto show = common_feature_value.Show(const_cast(v)); auto click = common_feature_value.Click(const_cast(v)); auto score = ShowClickScore(show, click); + auto mf_dim = int(common_feature_value.MfDim(const_cast(v))); if (score >= _config.embedx_threshold() && param > common_feature_value.EmbedxG2SumIndex()) { - // VLOG(1) << "common_feature_value.EmbedxG2SumIndex():" - // << common_feature_value.EmbedxG2SumIndex(); - // VLOG(1) << "common_feature_value.EmbedxWIndex():" - // << common_feature_value.EmbedxWIndex(); - // VLOG(1) << "common_feature_value.MfDim():" - // << common_feature_value.MfDim(const_cast(v)); for (auto i = common_feature_value.EmbedxG2SumIndex(); - i < common_feature_value.EmbedxWIndex() + - common_feature_value.MfDim(const_cast(v)); + i < common_feature_value.Dim(mf_dim); ++i) { os << " " << v[i]; } diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h index a360030cb7d3d..b820d617d06ae 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h @@ -54,10 +54,24 @@ class CtrDymfAccessor : public ValueAccessor { int ClickIndex() { return ShowIndex() + 1; } int EmbedWIndex() { return ClickIndex() + 1; } int EmbedG2SumIndex() { return EmbedWIndex() + 1; } - int SlotIndex() { return EmbedG2SumIndex() + 1; } + int SlotIndex() { return EmbedG2SumIndex() + embed_sgd_dim; } int MfDimIndex() { return SlotIndex() + 1; } int EmbedxG2SumIndex() { return MfDimIndex() + 1; } - int EmbedxWIndex() { return EmbedxG2SumIndex() + 1; } + int EmbedxWIndex() { return EmbedxG2SumIndex() + embedx_sgd_dim; } + + // 根据mf_dim计算的总长度 + int Dim(int& mf_dim) { + int tmp_embedx_sgd_dim = 1; + if (optimizer_name == "SparseAdamSGDRule") { // adam + tmp_embedx_sgd_dim = mf_dim * 2 + 2; + } else if (optimizer_name == "SparseSharedAdamSGDRule") { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return 7 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim; + 
} + + // 根据mf_dim计算的总byte数 + int Size(int& mf_dim) { return (Dim(mf_dim)) * sizeof(float); } float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; } float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; } @@ -73,6 +87,7 @@ class CtrDymfAccessor : public ValueAccessor { int embed_sgd_dim; int embedx_dim; int embedx_sgd_dim; + std::string optimizer_name; }; struct CtrDymfPushValue { diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc index 07562f566d326..014d6e450ab4a 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.cc @@ -213,7 +213,6 @@ void SparseAdamSGDRule::UpdateValueWork(float* w, float beta1_pow_ = *beta1_pow; float beta2_pow_ = *beta2_pow; - // lr not change in one update lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); for (size_t i = 0; i < _embedding_dim; i++) { // Calculation @@ -252,5 +251,88 @@ void SparseAdamSGDRule::InitValueWork(float* value, *(sgd + Beta1PowIndex()) = _beta1_decay_rate; *(sgd + Beta2PowIndex()) = _beta2_decay_rate; } + +void SparseSharedAdamSGDRule::LoadConfig( + const SparseCommonSGDRuleParameter& param, size_t emb_dim) { + _embedding_dim = emb_dim; + auto adam_param = param.adam(); + learning_rate_ = adam_param.learning_rate(); + _initial_range = adam_param.initial_range(); + _beta1_decay_rate = adam_param.beta1_decay_rate(); + _beta2_decay_rate = adam_param.beta2_decay_rate(); + _ada_epsilon = adam_param.ada_epsilon(); + if (adam_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adam_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << adam_param.weight_bounds_size(); + _min_bound = adam_param.weight_bounds(0); + _max_bound = adam_param.weight_bounds(1); + } +} + +void SparseSharedAdamSGDRule::UpdateValueWork(float* w, + float* sgd, + const float* grad, + float scale) { + float* gsum = sgd + GSumIndex(); + float* g2sum = sgd + G2SumIndex(); + float* beta1_pow = sgd + Beta1PowIndex(); + float* beta2_pow = sgd + Beta2PowIndex(); + const float* g = grad; + + float lr = learning_rate_; + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + float gsum_ = *gsum; + float g2sum_ = *g2sum; + + lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); + double sum_gsum = 0.0; + double sum_g2sum = 0.0; + for (int i = 0; i < _embedding_dim; i++) { + // Calculation + double new_gsum = + _beta1_decay_rate * gsum_ + (1 - _beta1_decay_rate) * g[i]; + double new_g2sum = + _beta2_decay_rate * g2sum_ + (1 - _beta2_decay_rate) * g[i] * g[i]; + w[i] = w[i] - lr * (new_gsum / (sqrt(new_g2sum) + _ada_epsilon)); + BoundValue(w[i]); + sum_gsum += new_gsum; + sum_g2sum += new_g2sum; + } + // update beta_pow_decay + (*gsum) = sum_gsum / _embedding_dim; + (*g2sum) = sum_g2sum / _embedding_dim; + (*beta1_pow) *= _beta1_decay_rate; + (*beta2_pow) *= _beta2_decay_rate; +} + +void SparseSharedAdamSGDRule::InitValueWork(float* value, + float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + BoundValue(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + BoundValue(value[i]); + } + } + // init rule gsum and g2sum + for (int i = GSumIndex(); i < Beta1PowIndex(); i++) { + sgd[i] = 0.0; + } + // init beta1_pow and beta2_pow + *(sgd + Beta1PowIndex()) = _beta1_decay_rate; + 
*(sgd + Beta2PowIndex()) = _beta2_decay_rate; +} } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h index f62cffdf232e7..4fed331ba93ec 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h @@ -144,5 +144,28 @@ class SparseAdamSGDRule : public SparseValueSGDRule { float _beta2_decay_rate; float _ada_epsilon; }; + +class SparseSharedAdamSGDRule : public SparseValueSGDRule { + public: + virtual void LoadConfig(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void UpdateValueWork(float* w, + float* sgd, + const float* push_value, + float scale); + virtual void InitValueWork(float* value, float* sgd, bool zero_init); + virtual size_t Dim() { return 4; } + size_t GSumIndex() { return 0; } + size_t G2SumIndex() { return GSumIndex() + 1; } + size_t Beta1PowIndex() { return G2SumIndex() + 1; } + size_t Beta2PowIndex() { return Beta1PowIndex() + 1; } + + protected: + float learning_rate_; + float _beta1_decay_rate; + float _beta2_decay_rate; + float _ada_epsilon; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/ps/table/table.cc b/paddle/fluid/distributed/ps/table/table.cc index cfa286f1c3f7f..3e6d5a9941206 100644 --- a/paddle/fluid/distributed/ps/table/table.cc +++ b/paddle/fluid/distributed/ps/table/table.cc @@ -49,6 +49,7 @@ REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseSharedAdamSGDRule); int32_t TableManager::Initialize() { static bool initialized = false; diff --git a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt index 8b5457ef9eea5..c9cd883dabb69 100644 --- a/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/wrapper/CMakeLists.txt @@ -13,6 +13,7 @@ cc_library( op_registry fs shell + ps_gpu_wrapper ${RPC_DEPS}) target_link_libraries(fleet z) diff --git a/paddle/fluid/distributed/ps/wrapper/fleet.cc b/paddle/fluid/distributed/ps/wrapper/fleet.cc index bbefeba559916..3d7190cf55336 100644 --- a/paddle/fluid/distributed/ps/wrapper/fleet.cc +++ b/paddle/fluid/distributed/ps/wrapper/fleet.cc @@ -18,6 +18,10 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/distributed/ps/table/table.h" +#include "paddle/fluid/distributed/ps/wrapper/fleet.h" +#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" +#endif namespace paddle { namespace distributed { @@ -129,6 +133,13 @@ void FleetWrapper::InitWorker(const std::string& dist_desc, worker_ptr_ = std::shared_ptr( paddle::distributed::PSClientFactory::Create(ps_param)); worker_ptr_->Configure(ps_param, dense_pull_regions, ps_env_, index); +#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE + VLOG(3) << "FleetWrapper::InitWorker InitializeGPUServer"; + auto* accessor = worker_ptr_->GetTableAccessor(0); + auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); + ps_gpu_wrapper->InitializeGPUServer(ps_param); + ps_gpu_wrapper->SetTableAccessor(accessor); +#endif } } else { VLOG(3) << "Client can be initialized only once"; @@ -525,11 +536,11 @@ void FleetWrapper::PushSparseFromTensorAsync( int batch_size = -1; bool batch_size_consist = true; for (auto* input : *inputs) { - int cur_batch_size = + size_t cur_batch_size = input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0]; if (batch_size == -1) { - batch_size = cur_batch_size; - } else if (batch_size != cur_batch_size) { + batch_size = int(cur_batch_size); + } else if (batch_size != int(cur_batch_size)) { // CHECK(batch_size == cur_batch_size); // NOLINT batch_size_consist = false; break; @@ -537,12 +548,12 @@ void FleetWrapper::PushSparseFromTensorAsync( } CHECK(batch_size > 0); // NOLINT - int show_size = + size_t show_size = shows->lod().size() ? shows->lod()[0].size() - 1 : shows->dims()[0]; - CHECK(show_size == batch_size || show_size == 1); - int clk_size = + CHECK(show_size == size_t(batch_size) || show_size == 1); + size_t clk_size = clks->lod().size() ? clks->lod()[0].size() - 1 : clks->dims()[0]; - CHECK(clk_size == batch_size || clk_size == 1); + CHECK(clk_size == size_t(batch_size) || clk_size == 1); CHECK(outputs->size() == inputs->size()); std::vector push_keys; @@ -601,12 +612,10 @@ void FleetWrapper::PushSparseFromTensorAsync( // in // ctr_accessor.h push_values.back()[0] = 2; // TODO(zhaocaibei123): slot - push_values.back()[1] = (static_cast(i) >= show_size - ? 1 - : static_cast(show_tensor[i])); - push_values.back()[2] = (static_cast(i) >= clk_size - ? 0 - : static_cast(clk_tensor[i])); + push_values.back()[1] = + (i >= show_size ? 1 : static_cast(show_tensor[i])); + push_values.back()[2] = + (i >= clk_size ? 0 : static_cast(clk_tensor[i])); float* data = push_values.back().data() + 3; memcpy(data, g + output_len, sizeof(float) * fea_dim); } @@ -630,12 +639,10 @@ void FleetWrapper::PushSparseFromTensorAsync( // slot show clk grad... consistent with CtrCommonPushValue defined in // ctr_accessor.h push_values.back()[0] = 2; // TODO(zhaocaibei123): slot - push_values.back()[1] = (static_cast(i) >= show_size - ? 1 - : static_cast(show_tensor[i])); - push_values.back()[2] = (static_cast(i) >= clk_size - ? 0 - : static_cast(clk_tensor[i])); + push_values.back()[1] = + (i >= show_size ? 1 : static_cast(show_tensor[i])); + push_values.back()[2] = + (i >= clk_size ? 
0 : static_cast(clk_tensor[i])); float* data = push_values.back().data() + 3; memcpy(data, g + output_len, sizeof(float) * fea_dim); } diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index b3a01ae169e4e..45758389c5413 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -197,14 +197,14 @@ message TableParameter { message TableAccessorParameter { optional string accessor_class = 1; - optional SGDParameter embed_sgd_param = 2; - optional SGDParameter embedx_sgd_param = 3; optional uint32 fea_dim = 4 [ default = 11 ]; // field size of one value optional uint32 embedx_dim = 5 [ default = 8 ]; // embedx feature size optional uint32 embedx_threshold = 6 [ default = 10 ]; // embedx feature create threshold optional CtrAccessorParameter ctr_accessor_param = 7; repeated TableAccessorSaveParameter table_accessor_save_param = 8; + optional SGDParameter embed_sgd_param = 10; + optional SGDParameter embedx_sgd_param = 11; } message SGDParameter { @@ -228,7 +228,7 @@ message repeated float weight_bounds = 4; } -message SparseAdamSGDParameter { // SparseAdamSGDRule +message SparseAdamSGDParameter { // SparseAdamSGDRule | SparseSharedAdamSGDRule optional double learning_rate = 1 [ default = 0.001 ]; optional double initial_range = 2 [ default = 0.0001 ]; optional double beta1_decay_rate = 3 [ default = 0.9 ]; diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 42235b7c484e3..4cf3ab8dc1a67 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -25,10 +25,17 @@ endif() if(WITH_HETERPS) if(WITH_NCCL AND WITH_GPU) - nv_library( - ps_gpu_wrapper - SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + if(WITH_PSCORE) + nv_library( + ps_gpu_wrapper + SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ps_framework_proto ${BRPC_DEPS}) + else() + nv_library( + ps_gpu_wrapper + SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc + DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) + endif() add_subdirectory(heter_ps) elseif(WITH_XPU_KP) xpu_library( diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 3955502c8b808..ef2e73d6dd5b5 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -81,7 +81,6 @@ class HeterContext { std::vector> device_values_; std::vector> device_keys_; std::vector>> device_dim_keys_; - std::vector>> device_dim_values_; std::vector mutex_; std::vector> dim_mutex_; int multi_mf_dim_ = 0; @@ -114,7 +113,6 @@ class HeterContext { value_dim_ptr_[i].resize(dim_num); } device_values_.resize(device_num); - device_dim_values_.resize(device_num); device_keys_.resize(device_num); device_dim_keys_.resize(device_num); diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 7540c6147f4b7..9631502f4f05e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -9,16 +9,16 @@ if(WITH_GPU) endif() nv_library( heter_comm_kernel - SRCS heter_comm_kernel.cu feature_value.h + SRCS heter_comm_kernel.cu feature_value.h feature_value.cu DEPS ${HETERPS_DEPS}) nv_library( hashtable_kernel - SRCS hashtable_kernel.cu feature_value.h + SRCS hashtable_kernel.cu feature_value.h feature_value.cu DEPS 
${HETERPS_DEPS}) nv_library( heter_comm - SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h - mem_pool.h + SRCS heter_comm.h feature_value.h feature_value.cu heter_resource.cc + heter_resource.h mem_pool.h DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) nv_test( test_heter_comm diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu new file mode 100644 index 0000000000000..560ce33b9af78 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -0,0 +1,192 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" + +namespace paddle { +namespace framework { + +template +__global__ void PullCopy(float** dest, + const float* src, + const int64_t* len, + int slot_num, + int total_len, + uint64_t** keys, + uint64_t max_val_size, + int* gpu_dim, + FVAccessor feature_value_accessor) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[x - 1] : 0); + float* feature_value_ptr = + (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + int mf_dim = gpu_dim[x] - 3; + feature_value_accessor.Select( + dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); + } +} + +template +__global__ void PushCopyWithPool(float* dest, + float** src, + int64_t* len, + int slot_num, + uint64_t total_len, + int bs, + int* slot_vector, + int* mf_dim_vector, + size_t grad_value_size, + FVAccessor feature_value_accessor) { + CUDA_KERNEL_LOOP(i, total_len) { + int low = 0; + int high = slot_num - 1; + while (low < high) { + int mid = (low + high) / 2; + if (i < len[mid]) + high = mid; + else + low = mid + 1; + } + int x = low; + int y = i - (x ? len[low - 1] : 0); + float* cur = (float*)((char*)dest + i * grad_value_size); + + cur[feature_value_accessor.common_push_value.SlotIndex()] = + (float)slot_vector[x]; + int mf_dim = mf_dim_vector[x]; + cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim; + + cur[feature_value_accessor.common_push_value.ShowIndex()] = + *(src[x] + y * (mf_dim + 3)); + cur[feature_value_accessor.common_push_value.ClickIndex()] = + *(src[x] + y * (mf_dim + 3) + 1); + cur[feature_value_accessor.common_push_value.EmbedGIndex()] = + *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; + for (int j = 0; j < mf_dim; j++) { + cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] = + *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; + } + } +} + +template +void AccessorWrapper::CopyForPullImpl( + const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); + float** gpu_values = reinterpret_cast(buf_value->ptr()); + cudaMemcpy(gpu_values, + values.data(), + values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + gpu_values, + total_values_gpu, + gpu_len, + slot_num, + total_length, + gpu_keys, + feature_value_size, + gpu_dim, + gpu_accessor_); + cudaStreamSynchronize(stream); +} + +template +void AccessorWrapper::CopyForPushImpl( + const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + auto stream = dynamic_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + auto slot_lengths_lod = slot_lengths; + for (int i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_grad_value = + memory::Alloc(place, grad_values.size() * sizeof(float*)); + auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + auto buf_slot_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + auto buf_mf_dim_vector = + memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); + float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); + int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); + cudaMemcpy(gpu_values, + grad_values.data(), + grad_values.size() * sizeof(float*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, + slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), + cudaMemcpyHostToDevice); + cudaMemcpy(d_slot_vector, + slot_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); + cudaMemcpy(d_mf_dim_vector, + slot_mf_dim_vector.data(), + slot_lengths_lod.size() * sizeof(int), + cudaMemcpyHostToDevice); + PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( + total_grad_values_gpu, + gpu_values, + gpu_len, + slot_lengths.size(), + total_length, + batch_size, + d_slot_vector, + d_mf_dim_vector, + grad_value_size, + gpu_accessor_); + cudaStreamSynchronize(stream); +} + +#ifdef PADDLE_WITH_PSCORE +template class AccessorWrapper; +#endif + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index cb7f3a40d6720..ef4533d64eac2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -17,12 +17,547 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_HETERPS #include +#include +#include + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" +#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" +#endif namespace paddle { namespace framework { #define MF_DIM 8 typedef uint64_t FeatureKey; +#define TYPEALIGN(ALIGNVAL, LEN) \ + (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) + +class FeatureValueAccessor { + public: + __host__ __device__ FeatureValueAccessor() {} + __host__ __device__ ~FeatureValueAccessor() {} + + __host__ __device__ virtual int Configure( + std::unordered_map config) { + _config = config; + Initialize(); + return 0; + } + __host__ __device__ virtual int Initialize() = 0; + + protected: + std::unordered_map _config; +}; + +// adagrad: embed_sgd_dim=1, embedx_sgd_dim=1,embedx_dim=n +// adam std: embed_sgd_dim=4, embedx_sgd_dim=n*2+2,embedx_dim=n +// adam shared: embed_sgd_dim=4, embedx_sgd_dim=4,embedx_dim=n +class CommonFeatureValueAccessor : public FeatureValueAccessor { + public: + struct CommonFeatureValue { + /* + uint64_t cpu_ptr; + float delta_score; + float show; + float click; + float embed_w; + std::vector embed_g2sum; + float slot; + float mf_dim + float mf_size + std::vector embedx_g2sum; + std::vector embedx_w; + */ + + __host__ __device__ int Dim() { + return 9 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; + } // has cpu_ptr(2) + __host__ __device__ int DimSize(size_t dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ size_t Size() { + return TYPEALIGN(8, Dim() * sizeof(float)); + } // cpu_ptr:uint64=2float + __host__ __device__ int EmbedDim() { return embed_sgd_dim; } + __host__ __device__ int EmbedXDim() { return embedx_sgd_dim; } + __host__ __device__ int EmbedWDim() { return embedx_dim; } + __host__ __device__ int CpuPtrIndex() { return 0; } // cpuprt uint64 + __host__ __device__ int DeltaScoreIndex() { return CpuPtrIndex() + 2; } + __host__ __device__ int ShowIndex() { return DeltaScoreIndex() + 1; } + __host__ __device__ int ClickIndex() { return ShowIndex() + 1; } + __host__ __device__ int EmbedWIndex() { return ClickIndex() + 1; } + __host__ __device__ int EmbedG2SumIndex() { return EmbedWIndex() + 1; } + __host__ __device__ int SlotIndex() { + return EmbedG2SumIndex() + embed_sgd_dim; + } + __host__ __device__ int MfDimIndex() { return SlotIndex() + 1; } + __host__ __device__ int MfSizeIndex() { + return MfDimIndex() + 1; + } // actual mf size (ex. 
0) + __host__ __device__ int EmbedxG2SumIndex() { return MfSizeIndex() + 1; } + __host__ __device__ int EmbedxWIndex() { + return EmbedxG2SumIndex() + embedx_sgd_dim; + } + + // 根据mf_dim计算的总长度 + __host__ __device__ int Dim(int& mf_dim) { + int tmp_embedx_sgd_dim = 1; + if (optimizer_type_ == 3) { // adam + tmp_embedx_sgd_dim = mf_dim * 2 + 2; + } else if (optimizer_type_ == 4) { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return 9 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim; + } + + // 根据mf_dim 计算的总byte数 + __host__ __device__ size_t Size(int& mf_dim) { + return TYPEALIGN(8, Dim(mf_dim) * sizeof(float)); // cpu_ptr:2float + } + + // 根据mf_dim 计算的 mf_size byte数 + __host__ __device__ size_t MFSize(int& mf_dim) { + int tmp_embedx_sgd_dim = 1; + if (optimizer_type_ == 3) { // adam + tmp_embedx_sgd_dim = mf_dim * 2 + 2; + } else if (optimizer_type_ == 4) { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return (tmp_embedx_sgd_dim + mf_dim) * sizeof(float); + } + + __host__ __device__ int EmbedxG2SumOffsetIndex() { return 0; } + __host__ __device__ int EmbedxWOffsetIndex(float* val) { + // has mf + int tmp_embedx_sgd_dim = 1; + if (int(MfSize(val)) > 0) { + if (optimizer_type_ == 3) { // adam + tmp_embedx_sgd_dim = int(MfDim(val)) * 2 + 2; + } else if (optimizer_type_ == 4) { // shared_adam + tmp_embedx_sgd_dim = 4; + } + return EmbedxG2SumIndex() + tmp_embedx_sgd_dim; + } else { + // no mf + return 0; + } + } + + __host__ __device__ uint64_t CpuPtr(float* val) { + return *(reinterpret_cast(val)); + } + __host__ __device__ float& DeltaScore(float* val) { + return val[DeltaScoreIndex()]; + } + __host__ __device__ float& Show(float* val) { return val[ShowIndex()]; } + __host__ __device__ float& Click(float* val) { return val[ClickIndex()]; } + __host__ __device__ float& Slot(float* val) { return val[SlotIndex()]; } + __host__ __device__ float& MfDim(float* val) { return val[MfDimIndex()]; } + __host__ __device__ float& MfSize(float* val) { return val[MfSizeIndex()]; } + __host__ __device__ float& EmbedW(float* val) { return val[EmbedWIndex()]; } + __host__ __device__ float& EmbedG2Sum(float* val) { + return val[EmbedG2SumIndex()]; + } + __host__ __device__ float& EmbedxG2Sum(float* val) { + return val[EmbedxG2SumIndex()]; + } + __host__ __device__ float& EmbedxW(float* val) { + return val[EmbedxWIndex()]; + } + + int embed_sgd_dim; + int embedx_dim; + int embedx_sgd_dim; + int optimizer_type_; + }; + + struct CommonPushValue { + /* + float slot; + float show; + float click; + float mf_dim; + float embed_g; + std::vector embedx_g; + */ + + __host__ __device__ int Dim(int embedx_dim) { return 5 + embedx_dim; } + + __host__ __device__ int DimSize(int dim, int embedx_dim) { + return sizeof(float); + } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } + __host__ __device__ int SlotIndex() { return 0; } + __host__ __device__ int ShowIndex() { + return CommonPushValue::SlotIndex() + 1; + } + __host__ __device__ int ClickIndex() { + return CommonPushValue::ShowIndex() + 1; + } + __host__ __device__ int MfDimIndex() { + return CommonPushValue::ClickIndex() + 1; + } + __host__ __device__ int EmbedGIndex() { + return CommonPushValue::MfDimIndex() + 1; + } + __host__ __device__ int EmbedxGIndex() { + return CommonPushValue::EmbedGIndex() + 1; + } + __host__ __device__ float& Slot(float* val) { + return val[CommonPushValue::SlotIndex()]; + } + __host__ __device__ float& Show(float* val) { + return val[CommonPushValue::ShowIndex()]; + } + __host__ 
__device__ float& Click(float* val) { + return val[CommonPushValue::ClickIndex()]; + } + __host__ __device__ float& MfDim(float* val) { + return val[CommonPushValue::MfDimIndex()]; + } + __host__ __device__ float& EmbedG(float* val) { + return val[CommonPushValue::EmbedGIndex()]; + } + __host__ __device__ float* EmbedxG(float* val) { + return val + CommonPushValue::EmbedxGIndex(); + } + }; + + struct CommonPullValue { + /* + float show; + float click; + float embed_w; + std::vector embedx_w; + */ + + __host__ __device__ static int Dim(int embedx_dim) { + return 3 + embedx_dim; + } + __host__ __device__ int DimSize(size_t dim) { return sizeof(float); } + __host__ __device__ int Size(int embedx_dim) { + return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float)); + } + __host__ __device__ int ShowIndex() { return 0; } + __host__ __device__ int ClickIndex() { return 1; } + __host__ __device__ int EmbedWIndex() { return 2; } + __host__ __device__ int EmbedxWIndex() { return 3; } + __host__ __device__ float& Show(float* val) { + return val[CommonPullValue::ShowIndex()]; + } + __host__ __device__ float& Click(float* val) { + return val[CommonPullValue::ClickIndex()]; + } + __host__ __device__ float& EmbedW(float* val) { + return val[CommonPullValue::EmbedWIndex()]; + } + __host__ __device__ float* EmbedxW(float* val) { + return val + CommonPullValue::EmbedxWIndex(); + } + }; + + __host__ __device__ CommonFeatureValueAccessor() {} + __host__ __device__ ~CommonFeatureValueAccessor() {} + + __host__ __device__ virtual int Initialize() { + int optimizer_type = (_config.find("optimizer_type") == _config.end()) + ? 1 + : int(_config["optimizer_type"]); + int sparse_embedx_dim = (_config.find("embedx_dim") == _config.end()) + ? 8 + : int(_config["embedx_dim"]); + if (optimizer_type == 3) { // adam + common_feature_value.embed_sgd_dim = 4; + common_feature_value.embedx_sgd_dim = sparse_embedx_dim * 2 + 2; + } else if (optimizer_type == 4) { // shared_adam + common_feature_value.embed_sgd_dim = 4; + common_feature_value.embedx_sgd_dim = 4; + } else { + common_feature_value.embed_sgd_dim = 1; + common_feature_value.embedx_sgd_dim = 1; + } + common_feature_value.optimizer_type_ = optimizer_type; + common_feature_value.embedx_dim = sparse_embedx_dim; + + return 0; + } + + // // build阶段从cpu_val赋值给gpu_val + __host__ void BuildFill( + float* gpu_val, + void* cpu, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); + paddle::distributed::FixedFeatureValue* cpu_ptr = + (paddle::distributed::FixedFeatureValue*)(cpu); + float* cpu_val = cpu_ptr->data(); + size_t cpu_dim = cpu_ptr->size(); + + gpu_val[common_feature_value.DeltaScoreIndex()] = + cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()]; + gpu_val[common_feature_value.ShowIndex()] = + cpu_val[cpu_accessor->common_feature_value.ShowIndex()]; + gpu_val[common_feature_value.ClickIndex()] = + cpu_val[cpu_accessor->common_feature_value.ClickIndex()]; + gpu_val[common_feature_value.SlotIndex()] = + cpu_val[cpu_accessor->common_feature_value.SlotIndex()]; + gpu_val[common_feature_value.EmbedWIndex()] = + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + gpu_val[common_feature_value.EmbedG2SumIndex() + i] = + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i]; + } + *(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex())) = 
(uint64_t)(cpu); + cpu_val[cpu_accessor->common_feature_value.MfDimIndex()] = float(mf_dim); + gpu_val[common_feature_value.MfDimIndex()] = mf_dim; + if (cpu_dim > cpu_accessor->GetAccessorInfo().dim - + cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)) { + gpu_val[common_feature_value.MfSizeIndex()] = + common_feature_value.MFSize(mf_dim) / sizeof(float); + + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); + x++) { + gpu_val[common_feature_value.EmbedxG2SumIndex() + x] = + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x]; + } + } else { + gpu_val[common_feature_value.MfSizeIndex()] = 0; + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); + x++) { + gpu_val[x] = 0; + } + } +#endif + } + + // dump_to_cpu阶段从gpu_val赋值给cpu_val + __host__ void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::CtrDymfAccessor* cpu_accessor = + dynamic_cast(cpu_table_accessor); + + auto* downpour_value = + (paddle::distributed::FixedFeatureValue*)(*(reinterpret_cast( + gpu_val + common_feature_value.CpuPtrIndex()))); + size_t downpour_value_size = downpour_value->size(); + if (gpu_val[common_feature_value.MfSizeIndex()] > 0 && + downpour_value_size == (cpu_accessor->GetAccessorInfo().dim - + int(cpu_accessor->GetAccessorInfo().mf_size / + sizeof(float)))) { // cpu_accessor + downpour_value->resize(cpu_accessor->common_feature_value.Dim(mf_dim)); + } + float* cpu_val = downpour_value->data(); + cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()] = + gpu_val[common_feature_value.DeltaScoreIndex()]; + cpu_val[cpu_accessor->common_feature_value.ShowIndex()] = + gpu_val[common_feature_value.ShowIndex()]; + cpu_val[cpu_accessor->common_feature_value.ClickIndex()] = + gpu_val[common_feature_value.ClickIndex()]; + cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()] = + gpu_val[common_feature_value.EmbedWIndex()]; + cpu_val[cpu_accessor->common_feature_value.SlotIndex()] = + gpu_val[common_feature_value.SlotIndex()]; + + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i] = + gpu_val[common_feature_value.EmbedG2SumIndex() + i]; + } + + if (gpu_val[common_feature_value.MfSizeIndex()] > 0) { + for (int x = 0; + x < int(common_feature_value.MFSize(mf_dim) / sizeof(float)); + x++) { + cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x] = + gpu_val[common_feature_value.EmbedxG2SumIndex() + x]; + } + } +#endif + } + + // dy_mf_fill_dvals_kernel, dy_mf_search_kernel 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void FeatureValueFill(float* dest_val, + float* src_val, + int mf_dim) { + *(reinterpret_cast(dest_val + + common_feature_value.CpuPtrIndex())) = + *(reinterpret_cast(src_val + + common_feature_value.CpuPtrIndex())); + dest_val[common_feature_value.DeltaScoreIndex()] = + src_val[common_feature_value.DeltaScoreIndex()]; + dest_val[common_feature_value.ShowIndex()] = + src_val[common_feature_value.ShowIndex()]; + dest_val[common_feature_value.ClickIndex()] = + src_val[common_feature_value.ClickIndex()]; + dest_val[common_feature_value.EmbedWIndex()] = + src_val[common_feature_value.EmbedWIndex()]; + for (int i = 0; i < common_feature_value.EmbedDim(); i++) { + dest_val[common_feature_value.EmbedG2SumIndex() + i] = + src_val[common_feature_value.EmbedG2SumIndex() + i]; + } + 
dest_val[common_feature_value.SlotIndex()] = + src_val[common_feature_value.SlotIndex()]; + dest_val[common_feature_value.MfDimIndex()] = mf_dim; + dest_val[common_feature_value.MfSizeIndex()] = + src_val[common_feature_value.MfSizeIndex()]; + + for (int x = common_feature_value.EmbedxG2SumIndex(); + x < int(common_feature_value.Size(mf_dim) / sizeof(float)); + x++) { + dest_val[x] = src_val[x]; + } + } + + // dy_mf_fill_shard_grads_kernel,update_one 阶段 gpukernel + // 中从src_val赋值给dest_val + __host__ __device__ void PushValueFill(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + + for (int x = 0; x < int(src_val[common_push_value.MfDimIndex()]); x++) { + dest_val[common_push_value.EmbedxGIndex() + x] = + src_val[common_push_value.EmbedxGIndex() + x]; + } + } + + // update_basic 阶段 gpukernel 中从src_val赋值给dest_val + __host__ __device__ void PushValueFillBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.SlotIndex()] = + src_val[common_push_value.SlotIndex()]; + dest_val[common_push_value.ShowIndex()] = + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] = + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.MfDimIndex()] = + src_val[common_push_value.MfDimIndex()]; + dest_val[common_push_value.EmbedGIndex()] = + src_val[common_push_value.EmbedGIndex()]; + } + + // merge_one 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValue(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; + for (int j = 0; j < int(dest_val[common_push_value.MfDimIndex()]); j++) { + dest_val[common_push_value.EmbedxGIndex() + j] += + src_val[common_push_value.EmbedxGIndex() + j]; + } + } + + // merge_basic 阶段 gpukernel 中 PushValue 从src_val赋值给dest_val + __host__ __device__ void MergePushValueBasic(float* dest_val, + const float* src_val) { + dest_val[common_push_value.ShowIndex()] += + src_val[common_push_value.ShowIndex()]; + dest_val[common_push_value.ClickIndex()] += + src_val[common_push_value.ClickIndex()]; + dest_val[common_push_value.EmbedGIndex()] += + src_val[common_push_value.EmbedGIndex()]; + } + + // PullCopy 阶段 gpukernel 中 FeatureValue回填到PullValue + __host__ __device__ void Select(float* dest_val, + float* src_val, + uint64_t* key, + int mf_dim) { + if (*key == 0) { + *(dest_val + common_pull_value.ShowIndex()) = 0; + *(dest_val + common_pull_value.ClickIndex()) = 0; + *(dest_val + common_pull_value.EmbedWIndex()) = 0; + } else { + *(dest_val + common_pull_value.ShowIndex()) = + src_val[common_feature_value.ShowIndex()]; + *(dest_val + common_pull_value.ClickIndex()) = + src_val[common_feature_value.ClickIndex()]; + *(dest_val + common_pull_value.EmbedWIndex()) = + src_val[common_feature_value.EmbedWIndex()]; + } + + if (src_val[common_feature_value.MfSizeIndex()] == 0 || *key == 0) { + for (int j = 0; j < mf_dim; j++) { + 
*(dest_val + common_pull_value.EmbedxWIndex() + j) = 0; + } + } else { + for (int j = 0; j < mf_dim; j++) { + *(dest_val + common_pull_value.EmbedxWIndex() + j) = + src_val[common_feature_value.EmbedxWOffsetIndex(src_val) + j]; + } + } + } + + __host__ __device__ std::string ParseToString(const float* v, + int param_size) { + /* + uint64_t cpu_ptr; // 2float + float delta_score; + float show; + float click; + float embed_w; + std::vector embed_g2sum; + float slot; + float mf_dim + float mf_size + std::vector embedx_g2sum; + std::vector embedx_w; + */ + std::stringstream os; + os << "cpuptr: " << common_feature_value.CpuPtr(const_cast(v)) + << " delta_score: " << v[2] << " show: " << v[3] << " click: " << v[4] + << " embed_w:" << v[5] << " embed_g2sum:"; + for (int i = common_feature_value.EmbedG2SumIndex(); + i < common_feature_value.SlotIndex(); + i++) { + os << " " << v[i]; + } + int mf_dim = int(common_feature_value.MfDim(const_cast(v))); + os << " slot: " << common_feature_value.Slot(const_cast(v)) + << " mf_dim: " << mf_dim + << " mf_size: " << common_feature_value.MfSize(const_cast(v)) + << " mf: "; + if (param_size > common_feature_value.EmbedxG2SumIndex()) { + for (auto i = common_feature_value.EmbedxG2SumIndex(); + i < common_feature_value.Dim(mf_dim); + ++i) { + os << " " << v[i]; + } + } + return os.str(); + } + + public: + CommonFeatureValue common_feature_value; + CommonPushValue common_push_value; + CommonPullValue common_pull_value; +}; struct FeatureValue { float delta_score; @@ -95,6 +630,176 @@ struct FeaturePushValue { } }; +class VirtualAccessor { + public: + virtual int Configure(std::unordered_map config) = 0; + + virtual size_t GetFeatureValueSize(int& mf_dim) = 0; + + virtual size_t GetPushValueSize(int& mf_dim) = 0; + + virtual void BuildFill(void* gpu_val, + void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; + + virtual void DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) = 0; + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) = 0; + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) = 0; + + virtual std::string ParseToString(const float* v, int param_size) = 0; +}; + +template +class AccessorWrapper : public VirtualAccessor { + public: + explicit AccessorWrapper() {} + virtual ~AccessorWrapper() {} + AccessorWrapper(const AccessorWrapper&) = delete; + AccessorWrapper& operator=(const AccessorWrapper&) = delete; + + virtual int Configure(std::unordered_map config) { + return gpu_accessor_.Configure(config); + } + + virtual size_t GetFeatureValueSize(int& mf_dim) { + return gpu_accessor_.common_feature_value.Size(mf_dim); + } + + virtual size_t GetPushValueSize(int& mf_dim) { + return gpu_accessor_.common_push_value.Size(mf_dim); + } + + virtual void BuildFill(void* gpu_val, + void* cpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { + gpu_accessor_.BuildFill( + (float*)(gpu_val), cpu_val, cpu_table_accessor, mf_dim); + } + + virtual void 
DumpFill(float* gpu_val, + paddle::distributed::ValueAccessor* cpu_table_accessor, + int mf_dim) { + gpu_accessor_.DumpFill(gpu_val, cpu_table_accessor, mf_dim); + } + + virtual void CopyForPull(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size) { + CopyForPullImpl(place, + gpu_keys, + values, + total_values_gpu, + gpu_len, + slot_num, + hidden_size, + total_length, + gpu_dim, + feature_value_size); + } + + virtual void CopyForPush(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector) { + CopyForPushImpl(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + total_length, + batch_size, + grad_value_size, + slot_vector, + slot_mf_dim_vector); + } + + void CopyForPullImpl(const paddle::platform::Place& place, + uint64_t** gpu_keys, + const std::vector& values, + const float* total_values_gpu, + const int64_t* gpu_len, + const int slot_num, + const int hidden_size, + const int64_t total_length, + int* gpu_dim, + int feature_value_size); + + void CopyForPushImpl(const paddle::platform::Place& place, + const std::vector& grad_values, + float* total_grad_values_gpu, + const std::vector& slot_lengths, + const uint64_t total_length, + const int batch_size, + size_t grad_value_size, + std::vector& slot_vector, + std::vector& slot_mf_dim_vector); + + virtual std::string ParseToString(const float* v, int param_size) { + return gpu_accessor_.ParseToString(v, param_size); + } + + GPUAccessor gpu_accessor_; +}; + +class GlobalAccessorTransfor { + public: + static GlobalAccessorTransfor& GetInstance() { + static GlobalAccessorTransfor ins; + return ins; + } + void Init(std::string accessor_type) { + if (accessor_wrapper_ptr_ != nullptr) { + return; + } + if (accessor_type == "CtrDymfAccessor") { + accessor_wrapper_ptr_ = new AccessorWrapper(); + } else { + VLOG(0) << "GlobalAccessorTransfor Init not support accessor_type:" + << accessor_type; + accessor_wrapper_ptr_ = new AccessorWrapper(); + } + } + VirtualAccessor* GetAccessorWrapper() { return accessor_wrapper_ptr_; } + + private: + VirtualAccessor* accessor_wrapper_ptr_ = nullptr; +}; + } // end namespace framework } // end namespace paddle + #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h index 9a6581c2ae5e3..a4bee2c19bbda 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -25,10 +25,12 @@ #ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { -class GpuPsGraphTable : public HeterComm { +class GpuPsGraphTable + : public HeterComm { public: GpuPsGraphTable(std::shared_ptr resource, int topo_aware) - : HeterComm(1, resource) { + : HeterComm( + 1, resource) { load_factor_ = 0.25; rw_lock.reset(new pthread_rwlock_t()); gpu_num = resource_->total_device(); diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index dbd6130c1461d..43192df0c71f0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ 
b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -137,8 +137,12 @@ class HashTable { size_t len, StreamType stream); - template - void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream); + template + void get(const KeyType* d_keys, + char* d_vals, + size_t len, + StreamType stream, + FVAccessor& fv_accessor); void show(); @@ -150,9 +154,9 @@ class HashTable { #if defined(PADDLE_WITH_CUDA) - template + template void update(const KeyType* d_keys, - const GradType* d_grads, + const float* d_grads, size_t len, Sgd sgd, StreamType stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index bb9998249048e..2f5d5697e7c38 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -83,36 +83,25 @@ __global__ void search_kernel(Table* table, } } -template +template __global__ void dy_mf_search_kernel(Table* table, const typename Table::key_type* const keys, char* vals, size_t len, - size_t pull_feature_value_size) { + size_t pull_feature_value_size, + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { uint64_t offset = i * pull_feature_value_size; - FeatureValue* cur = (FeatureValue*)(vals + offset); - FeatureValue& input = *(FeatureValue*)(it->second); - cur->slot = input.slot; - cur->show = input.show; - cur->clk = input.clk; - cur->mf_dim = input.mf_dim; - cur->lr = input.lr; - cur->mf_size = input.mf_size; - cur->cpu_ptr = input.cpu_ptr; - cur->delta_score = input.delta_score; - cur->lr_g2sum = input.lr_g2sum; - for (int j = 0; j < cur->mf_dim + 1; ++j) { - cur->mf[j] = input.mf[j]; - } - } else { - if (keys[i] != 0) { - printf("warning::pull miss key: %llu", keys[i]); - } + float* cur = (float*)(vals + offset); + float* input = it->second; + int mf_dim = + int(input[feature_value_accessor.common_feature_value.MfDimIndex()]); + + feature_value_accessor.FeatureValueFill(cur, input, mf_dim); } } } @@ -145,8 +134,8 @@ __global__ void dy_mf_update_kernel(Table* table, if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); - sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); + float* cur = (float*)(grads + i * grad_value_size); + sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, cur); } else { if (keys[i] != 0) { printf("warning::push miss key: %llu", keys[i]); @@ -212,17 +201,18 @@ void HashTable::get(const KeyType* d_keys, } template -template +template void HashTable::get(const KeyType* d_keys, char* d_vals, size_t len, - StreamType stream) { + StreamType stream, + FVAccessor& fv_accessor) { if (len == 0) { return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; dy_mf_search_kernel<<>>( - container_, d_keys, d_vals, len, pull_feature_value_size_); + container_, d_keys, d_vals, len, pull_feature_value_size_, fv_accessor); } template @@ -298,27 +288,6 @@ void HashTable::dump_to_cpu(int devid, StreamType stream) { cpu_val[x + 7] = gpu_val.mf[x]; } } -#endif -#ifdef PADDLE_WITH_PSCORE - auto* downpour_value = - (paddle::distributed::FixedFeatureValue*)(gpu_val.cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val.mf_size > 0 && downpour_value_size == 7) { - downpour_value->resize(gpu_val.mf_size + downpour_value_size); - } - float* cpu_val = 
downpour_value->data(); - // cpu_val[0] = 0; - cpu_val[2] = gpu_val.delta_score; - cpu_val[3] = gpu_val.show; - cpu_val[4] = gpu_val.clk; - cpu_val[5] = gpu_val.lr; - cpu_val[6] = gpu_val.lr_g2sum; - cpu_val[0] = gpu_val.slot; - if (gpu_val.mf_size > 0) { - for (int x = 0; x < gpu_val.mf_size; x++) { - cpu_val[x + 7] = gpu_val.mf[x]; - } - } #endif } }; @@ -336,9 +305,9 @@ void HashTable::dump_to_cpu(int devid, StreamType stream) { } template -template +template void HashTable::update(const KeyType* d_keys, - const GradType* d_grads, + const float* d_grads, size_t len, Sgd sgd, StreamType stream) { @@ -371,8 +340,8 @@ void HashTable::update(const KeyType* d_keys, push_grad_value_size_); } -template class HashTable; -template class HashTable; +template class HashTable; +template class HashTable; template class HashTable; template class HashTable; template class HashTable; @@ -382,15 +351,19 @@ template class HashTable; template class HashTable; template class HashTable; -template void HashTable::get< - cudaStream_t>(const unsigned long* d_keys, - paddle::framework::FeatureValue* d_vals, - size_t len, - cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, + float* d_vals, + size_t len, + cudaStream_t stream); template void -HashTable::get( - const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream); +HashTable::get( + const unsigned long* d_keys, + char* d_vals, + size_t len, + cudaStream_t stream, + CommonFeatureValueAccessor& fv_accessor); template void HashTable::get(const long* d_keys, int* d_vals, @@ -399,6 +372,12 @@ template void HashTable::get(const long* d_keys, template void HashTable::get( const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream); +template void HashTable::get( + const unsigned long* d_keys, + unsigned long* d_vals, + size_t len, + cudaStream_t stream); + template void HashTable::get( const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); template void HashTable::get(const long* d_keys, @@ -414,19 +393,19 @@ template void HashTable::get( // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t // stream); -template void HashTable::insert< - cudaStream_t>(const unsigned long* d_keys, - const paddle::framework::FeatureValue* d_vals, - size_t len, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, + const float* d_vals, + size_t len, + cudaStream_t stream); -template void HashTable:: - insert(const unsigned long* d_keys, - size_t len, - char* pool, - size_t feature_value_size, - size_t start_index, - cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, + size_t len, + char* pool, + size_t feature_value_size, + size_t start_index, + cudaStream_t stream); template void HashTable::insert(const long* d_keys, const int* d_vals, @@ -460,30 +439,37 @@ template void HashTable::insert( size_t len, cudaStream_t stream); -template void HashTable:: - dump_to_cpu(int devid, cudaStream_t stream); +template void HashTable::insert( + const unsigned long* d_keys, + const unsigned long* d_vals, + size_t len, + cudaStream_t stream); -template void HashTable::update< - paddle::framework::FeaturePushValue, - Optimizer, +template void HashTable::dump_to_cpu( + int devid, cudaStream_t stream); + +template void +HashTable::update( + const unsigned long* d_keys, + const char* d_grads, + size_t len, + SparseAdagradOptimizer sgd, + cudaStream_t stream); +template void +HashTable::update( + const unsigned long* d_keys, + 
const char* d_grads, + size_t len, + SparseAdamOptimizer sgd, + cudaStream_t stream); +template void HashTable::update< + SparseAdamSharedOptimizer, cudaStream_t>(const unsigned long* d_keys, - const paddle::framework::FeaturePushValue* d_grads, + const char* d_grads, size_t len, - Optimizer sgd, + SparseAdamSharedOptimizer sgd, cudaStream_t stream); -template void HashTable:: - update, - cudaStream_t>(const unsigned long* d_keys, - const char* d_grads, - size_t len, - Optimizer sgd, - cudaStream_t stream); - // template void HashTable::update< // Optimizer +template class HeterComm { public: HeterComm(size_t capacity, std::shared_ptr resource); @@ -65,12 +68,9 @@ class HeterComm { GradType* d_grads, size_t len, int& uniq_len); // NOLINT - void dynamic_merge_grad(int gpu_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - int& uniq_len); - void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len); + void dynamic_merge_grad( + int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len); + void pull_sparse(int num, KeyType* d_keys, float* d_vals, size_t len); void build_ps(int num, KeyType* h_keys, ValType* h_vals, @@ -92,7 +92,7 @@ class HeterComm { template void push_sparse(int num, KeyType* d_keys, - GradType* d_grads, + float* d_grads, size_t len, Sgd& sgd); // NOLINT #elif defined(PADDLE_WITH_XPU_KP) @@ -149,6 +149,13 @@ class HeterComm { multi_mf_dim_ = multi_mf_dim; max_mf_dim_ = max_mf_dim; } + + void set_accessor(FVAccessor& accessor) { + feature_value_accessor_ = accessor; + // for (auto& ptr_table: ptr_tables_) { + // ptr_table->set_accessor(feature_value_accessor_); + // } + } #endif bool need_transfer(int send_id, int receive_id) { @@ -282,9 +289,11 @@ class HeterComm { char* src_val, size_t val_size); + FVAccessor feature_value_accessor_; + protected: using Table = HashTable; - using PtrTable = HashTable; + using PtrTable = HashTable; std::vector tables_; std::vector ptr_tables_; std::shared_ptr resource_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index a7333cd01c6ec..f8657c8e895ad 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -24,8 +24,12 @@ limitations under the License. 
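HeterComm itself is now templated on the accessor type, so one communicator implementation serves whatever value layout the accessor describes. A sketch of the wiring (the value and gradient template arguments and the fleet_config contents are illustrative, not taken from the patch):

    using GpuPsComm =
        HeterComm<unsigned long, float, float, CommonFeatureValueAccessor>;

    auto comm = std::make_shared<GpuPsComm>(capacity, resource);  // resource: GPU resource pool
    CommonFeatureValueAccessor accessor;
    accessor.Configure(fleet_config);   // hyper-parameters keyed by name
    comm->set_accessor(accessor);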
*/ namespace paddle { namespace framework { -template -HeterComm::HeterComm( + +template +HeterComm::HeterComm( size_t capacity, std::shared_ptr resource) { VLOG(1) << "Construct new HeterComm"; resource_ = resource; @@ -42,10 +46,14 @@ HeterComm::HeterComm( tables_.push_back(table); } else { max_mf_dim_ = resource_->max_mf_dim(); - size_t val_type_size = TYPEALIGN( - 8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); - size_t grad_type_size = TYPEALIGN( - 8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = + accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + size_t grad_type_size = + accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size + << ", feature_value_push_size:" << grad_type_size; auto ptr_table = new PtrTable(capacity / load_factor_); ptr_table->set_feature_value_size(val_type_size, grad_type_size); ptr_tables_.push_back(ptr_table); @@ -58,8 +66,11 @@ HeterComm::HeterComm( init_path(); } -template -void HeterComm::init_path() { +template +void HeterComm::init_path() { int total_device = resource_->total_device(); path_.resize(total_device); if (!topo_aware_) { @@ -111,14 +122,18 @@ void HeterComm::init_path() { } } -template +template template -void HeterComm::memory_copy(DstPlace dst_place, - void* dst, - SrcPlace src_place, - const void* src, - size_t count, - StreamType stream) { +void HeterComm::memory_copy( + DstPlace dst_place, + void* dst, + SrcPlace src_place, + const void* src, + size_t count, + StreamType stream) { #if defined(PADDLE_WITH_CUDA) cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream); if (stream == 0) { @@ -129,11 +144,12 @@ void HeterComm::memory_copy(DstPlace dst_place, #endif } -template -void HeterComm::create_storage(int start_index, - int end_index, - int keylen, - int vallen) { +template +void HeterComm::create_storage( + int start_index, int end_index, int keylen, int vallen) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; @@ -167,9 +183,12 @@ void HeterComm::create_storage(int start_index, #endif } -template -void HeterComm::destroy_storage(int start_index, - int end_index) { +template +void HeterComm::destroy_storage( + int start_index, int end_index) { #if defined(PADDLE_WITH_CUDA) auto& allocator = allocators_[start_index]; auto& nodes = path_[start_index][end_index].nodes_; @@ -184,13 +203,17 @@ void HeterComm::destroy_storage(int start_index, #endif } -template -void HeterComm::walk_to_dest(int start_index, - int num, - int* h_left, - int* h_right, - KeyType* src_key, - GradType* src_val) { +template +void HeterComm::walk_to_dest( + int start_index, + int num, + int* h_left, + int* h_right, + KeyType* src_key, + GradType* src_val) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -267,14 +290,18 @@ void HeterComm::walk_to_dest(int start_index, } } -template -void HeterComm::walk_to_dest(int start_index, - int gpu_num, - int* h_left, - int* h_right, - KeyType* src_key, - char* src_val, - size_t val_size) { +template +void HeterComm::walk_to_dest( + int start_index, + int gpu_num, + int* h_left, + int* h_right, + KeyType* src_key, + char* src_val, + size_t val_size) { int need_copy_val = 0; if (src_val) { need_copy_val = 1; @@ -327,13 +354,17 @@ void HeterComm::walk_to_dest(int start_index, } } -template -void 
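Record sizes are no longer sizeof(FeatureValue) / sizeof(FeaturePushValue); they come from the accessor wrapper and depend on the largest mf_dim in the pass, and every buffer in this file is then addressed with byte offsets. The recurring pattern (pool and index names assumed):

    auto accessor_wrapper_ptr =
        GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
    size_t val_bytes  = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
    size_t grad_bytes = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);

    // i-th record inside a flat byte pool:
    float* val_i  = reinterpret_cast<float*>(val_pool + i * val_bytes);
    float* grad_i = reinterpret_cast<float*>(grad_pool + i * grad_bytes);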
HeterComm::walk_to_src(int start_index, - int gpu_num, - int* h_left, - int* h_right, - char* src_val, - size_t val_size) { +template +void HeterComm::walk_to_src( + int start_index, + int gpu_num, + int* h_left, + int* h_right, + char* src_val, + size_t val_size) { std::queue que; for (int i = 0; i < gpu_num; i++) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -383,8 +414,11 @@ void HeterComm::walk_to_src(int start_index, } } -template -HeterComm::~HeterComm() { +template +HeterComm::~HeterComm() { if (!multi_mf_dim_) { for (auto& table : tables_) { delete table; @@ -402,15 +436,22 @@ HeterComm::~HeterComm() { } } -template -void HeterComm::show_one_table(int gpu_num) { +template +void HeterComm::show_one_table( + int gpu_num) { if (!multi_mf_dim_) { tables_[gpu_num]->show(); } } -template -int HeterComm::log2i(int x) { +template +int HeterComm::log2i(int x) { unsigned res = 0; while (x >>= 1) { ++res; @@ -418,13 +459,20 @@ int HeterComm::log2i(int x) { return res; } -template -int HeterComm::get_index_by_devid(int devid) { +template +int HeterComm::get_index_by_devid( + int devid) { return resource_->get_index_by_devid(devid); } -template -void HeterComm::set_sparse_sgd( +template +void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -436,8 +484,11 @@ void HeterComm::set_sparse_sgd( } } -template -void HeterComm::set_embedx_sgd( +template +void HeterComm::set_embedx_sgd( const OptimizerConfig& optimizer_config) { for (int i = 0; i < resource_->total_device(); ++i) { AnyDeviceGuard guard(resource_->dev_id(i)); @@ -449,13 +500,17 @@ void HeterComm::set_embedx_sgd( } } -template -void HeterComm::build_ps(int dev_num, - KeyType* h_keys, - ValType* h_vals, - size_t len, - size_t chunk_size, - int stream_num) { +template +void HeterComm::build_ps( + int dev_num, + KeyType* h_keys, + ValType* h_vals, + size_t len, + size_t chunk_size, + int stream_num) { if (len <= 0) { return; } @@ -518,14 +573,18 @@ void HeterComm::build_ps(int dev_num, } } -template -void HeterComm::build_ps(int num, - KeyType* h_keys, - char* pool, - size_t len, - size_t feature_value_size, - size_t chunk_size, - int stream_num) { +template +void HeterComm::build_ps( + int num, + KeyType* h_keys, + char* pool, + size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { if (len <= 0) { return; } @@ -580,8 +639,11 @@ void HeterComm::build_ps(int num, } } -template -void HeterComm::merge_grad( +template +void HeterComm::merge_grad( int dev_num, KeyType* d_keys, GradType* d_grads, @@ -654,13 +716,12 @@ void HeterComm::merge_grad( sync_stream(stream); } -template -void HeterComm::dynamic_merge_grad( - int gpu_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - int& uniq_len) { +template +void HeterComm::dynamic_merge_grad( + int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len) { int dev_id = resource_->dev_id(gpu_num); platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDADeviceGuard guard(dev_id); @@ -668,16 +729,15 @@ void HeterComm::dynamic_merge_grad( size_t temp_storage_bytes; - // VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; - size_t grad_value_size = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto d_merge_keys = 
memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_merge_keys_ptr = reinterpret_cast(d_merge_keys->ptr()); auto d_merge_grads = memory::Alloc(place, len * grad_value_size); - GradType* d_merge_grads_ptr = - reinterpret_cast(d_merge_grads->ptr()); + float* d_merge_grads_ptr = reinterpret_cast(d_merge_grads->ptr()); auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); uint32_t* d_fea_num_info_ptr = @@ -772,7 +832,8 @@ void HeterComm::dynamic_merge_grad( uniq_len, grad_value_size, merger_, - stream); + stream, + feature_value_accessor_); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, d_merge_grads_ptr, @@ -782,8 +843,11 @@ void HeterComm::dynamic_merge_grad( PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); } -template -void HeterComm::split_input_to_shard( +template +void HeterComm::split_input_to_shard( KeyType* d_keys, int* d_idx_ptr, size_t len, @@ -843,11 +907,12 @@ void HeterComm::split_input_to_shard( sync_stream(stream); } -template -void HeterComm::pull_sparse(int num, - KeyType* d_keys, - ValType* d_vals, - size_t len) { +template +void HeterComm::pull_sparse( + int num, KeyType* d_keys, float* d_vals, size_t len) { if (len == 0) { return; } @@ -893,12 +958,15 @@ void HeterComm::pull_sparse(int num, auto d_idx = memory::Alloc(place, len * sizeof(int)); int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); - size_t val_type_size = - TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); + + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t val_type_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size; auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_vals = memory::Alloc(place, len * val_type_size); - ValType* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + float* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); @@ -944,7 +1012,8 @@ void HeterComm::pull_sparse(int num, ptr_tables_[i]->get(reinterpret_cast(node.key_storage), node.val_storage, h_right[i] - h_left[i] + 1, - resource_->remote_stream(i, num)); + resource_->remote_stream(i, num), + feature_value_accessor_); } for (int i = 0; i < total_device; ++i) { @@ -964,10 +1033,16 @@ void HeterComm::pull_sparse(int num, auto& node = path_[num][i].nodes_.front(); sync_stream(node.out_stream); } - heter_comm_kernel_->dy_mf_fill_dvals( - d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size, stream); + heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr, + d_vals, + d_idx_ptr, + len, + val_type_size, + stream, + feature_value_accessor_); sync_stream(stream); + for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { continue; @@ -977,13 +1052,17 @@ void HeterComm::pull_sparse(int num, } #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - GradType* d_grads, - size_t len, - Sgd& sgd) { // NOLINT +void HeterComm::push_sparse( + int dev_num, + KeyType* d_keys, + float* d_grads, + size_t len, + Sgd& sgd) { // NOLINT if (len == 0) { return; } @@ -991,8 +1070,9 @@ void HeterComm::push_sparse(int dev_num, int total_device = resource_->total_device(); int dev_id = resource_->dev_id(dev_num); - size_t 
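dynamic_merge_grad deduplicates keys before any table update: keys are sorted together with their original positions, run-length encoded into unique keys plus counts, and an exclusive prefix sum turns the counts into group offsets that merge_gradient walks. Roughly, for a toy batch:

    // keys                 : {7, 3, 7, 9}
    // sorted (key, index)  : {(3,1), (7,0), (7,2), (9,3)}
    // unique keys          : {3, 7, 9}
    // counts               : {1, 2, 1}
    // offsets (excl. scan) : {0, 1, 3}
    // merge_gradient sums the two gradients of key 7 (indices 0 and 2) into one record.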
grad_value_size = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); DevPlace place = DevPlace(dev_id); AnyDeviceGuard guard(dev_id); auto stream = resource_->local_stream(dev_num, 0); @@ -1037,8 +1117,7 @@ void HeterComm::push_sparse(int dev_num, KeyType* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); auto d_shard_grads = memory::Alloc(place, len * grad_value_size); - GradType* d_shard_grads_ptr = - reinterpret_cast(d_shard_grads->ptr()); + float* d_shard_grads_ptr = reinterpret_cast(d_shard_grads->ptr()); int uniq_len = len; dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len); @@ -1048,24 +1127,15 @@ void HeterComm::push_sparse(int dev_num, split_input_to_shard( d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); - if (!multi_mf_dim_) { - heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, - d_keys, - d_shard_grads_ptr, - d_grads, - d_idx_ptr, - uniq_len, - stream); - } else { - heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr, - d_keys, - d_shard_grads_ptr, - d_grads, - d_idx_ptr, - uniq_len, - grad_value_size, - stream); - } + heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr, + d_keys, + d_shard_grads_ptr, + d_grads, + d_idx_ptr, + uniq_len, + grad_value_size, + stream, + feature_value_accessor_); sync_stream(stream); @@ -1089,33 +1159,17 @@ void HeterComm::push_sparse(int dev_num, if (h_left[i] == -1 || h_right[i] == -1) { continue; } - if (!multi_mf_dim_) { - create_storage(dev_num, - i, - shard_len * sizeof(KeyType), - shard_len * sizeof(GradType)); - } else { - create_storage( - dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size); - } + create_storage( + dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size); } - if (!multi_mf_dim_) { - walk_to_dest(dev_num, - total_device, - h_left, - h_right, - d_shard_keys_ptr, - d_shard_grads_ptr); - } else { - walk_to_dest(dev_num, - total_device, - h_left, - h_right, - d_shard_keys_ptr, - reinterpret_cast(d_shard_grads_ptr), - grad_value_size); - } + walk_to_dest(dev_num, + total_device, + h_left, + h_right, + d_shard_keys_ptr, + reinterpret_cast(d_shard_grads_ptr), + grad_value_size); for (int i = 0; i < total_device; ++i) { if (h_left[i] == -1 || h_right[i] == -1) { @@ -1125,21 +1179,12 @@ void HeterComm::push_sparse(int dev_num, sync_stream(node.in_stream); AnyDeviceGuard guard(resource_->dev_id(i)); - if (!multi_mf_dim_) { - tables_[i]->rwlock_->WRLock(); - tables_[i]->update(reinterpret_cast(node.key_storage), - reinterpret_cast(node.val_storage), - h_right[i] - h_left[i] + 1, - sgd, - resource_->remote_stream(i, dev_num)); - } else { - ptr_tables_[i]->rwlock_->WRLock(); - ptr_tables_[i]->update(reinterpret_cast(node.key_storage), - node.val_storage, - h_right[i] - h_left[i] + 1, - sgd, - resource_->remote_stream(i, dev_num)); - } + ptr_tables_[i]->rwlock_->WRLock(); + ptr_tables_[i]->update(reinterpret_cast(node.key_storage), + node.val_storage, + h_right[i] - h_left[i] + 1, + sgd, + resource_->remote_stream(i, dev_num)); } for (int i = 0; i < total_device; ++i) { @@ -1162,11 +1207,12 @@ void HeterComm::push_sparse(int dev_num, } #elif defined(PADDLE_WITH_XPU_KP) -template -void HeterComm::push_sparse(int dev_num, - KeyType* d_keys, - GradType* d_grads, - size_t len) { +template +void HeterComm::push_sparse( + int dev_num, KeyType* d_keys, GradType* d_grads, 
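Every GPU push now goes through this dynamic-mf branch, so a caller only has to build the optimizer functor and hand it in; a hypothetical call (d_grads is a flattened float push-value buffer):

    SparseAdamOptimizer sgd(accessor);   // adagrad / adam / shared_adam all fit the same slot
    comm->push_sparse(dev_id, d_keys, d_grads, len, sgd);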
size_t len) { if (len == 0) { return; } @@ -1302,9 +1348,12 @@ void HeterComm::push_sparse(int dev_num, #endif #if defined(PADDLE_WITH_CUDA) -template +template template -void HeterComm::update_one_table( +void HeterComm::update_one_table( int gpu_num, KeyType* d_keys, GradType* d_grads, @@ -1323,9 +1372,12 @@ void HeterComm::update_one_table( cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); } -template +template template -void HeterComm::push_sparse_multi_node( +void HeterComm::push_sparse_multi_node( int gpu_num, KeyType* d_keys, GradType* d_grads, @@ -1352,8 +1404,11 @@ void HeterComm::push_sparse_multi_node( sgd); } -template -int HeterComm::gather_one_node_grad( +template +int HeterComm::gather_one_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int total_gpu = resource_->total_device(); int dev_id = resource_->dev_id(gpu_num); @@ -1454,8 +1509,11 @@ int HeterComm::gather_one_node_grad( return ret; } -template -int HeterComm::gather_multi_node_grad( +template +int HeterComm::gather_multi_node_grad( int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int dev_id = resource_->dev_id(gpu_num); auto& storage = storage_[gpu_num]; @@ -1525,8 +1583,11 @@ int HeterComm::gather_multi_node_grad( } #endif -template -void HeterComm::end_pass() { +template +void HeterComm::end_pass() { int total_device = resource_->total_device(); std::vector threads; @@ -1547,8 +1608,10 @@ void HeterComm::end_pass() { } } -// template -// void HeterComm::dump_to_cpu(int index) { +// template +// void HeterComm::dump_to_cpu(int +// index) { // auto stream = resource_->local_stream(index, 0); // int dev_id = resource_->dev_id(index); // platform::CUDADeviceGuard guard(dev_id); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index fd0dd1a72cca1..ebf7e76527af0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -128,22 +128,28 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, } } -template -__global__ void dy_mf_fill_shard_grads_kernel(KeyType* d_shard_keys, - KeyType* d_keys, - GradType* d_shard_grads, - GradType* d_grads, - T* idx, - size_t len, - size_t grad_value_size) { +template +__global__ void dy_mf_fill_shard_grads_kernel( + KeyType* d_shard_keys, + KeyType* d_keys, + float* d_shard_grads, + float* d_grads, + T* idx, + size_t len, + size_t grad_value_size, + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { d_shard_keys[i] = d_keys[idx[i]]; - *(GradType*)((char*)d_shard_grads + i * grad_value_size) = - *(GradType*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); + float* cur = (float*)((char*)d_shard_grads + i * grad_value_size); + float* shard_val = + (float*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); + + feature_value_accessor.PushValueFill(cur, shard_val); } } +template __global__ void merge_gradients_kernel(const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -151,36 +157,40 @@ __global__ void merge_gradients_kernel(const uint32_t* offset, char* output, int n, size_t grad_value_size, - DynamicGradMerger& merger_) { + DynamicGradMerger& merger, + FVAccessor& feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { uint32_t start = offset[i]; uint32_t num = fea_num[i]; int ori_index = index[start]; - FeaturePushValue& out = 
*(FeaturePushValue*)(output + i * grad_value_size); - FeaturePushValue& in = - *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger_.update_one(out, in); + float* out = (float*)(output + i * grad_value_size); + float* in = (float*)(input + size_t(ori_index) * grad_value_size); + merger.update_one(out, in, feature_value_accessor); for (int j = 1; j < num; ++j) { ori_index = index[start + j]; - FeaturePushValue& rhs = - *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger_.merge_one(out, rhs); + in = (float*)(input + size_t(ori_index) * grad_value_size); + merger.merge_one(out, in, feature_value_accessor); } } } -template -__global__ void dy_mf_fill_dvals_kernel(ValType* d_shard_vals, - ValType* d_vals, +template +__global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals, + float* d_vals, T* idx, size_t len, - size_t val_size) { + size_t val_size, + FVAccessor feature_value_accessor) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { uint64_t new_offset = uint64_t(idx[i]) * val_size; - *(ValType*)((char*)d_vals + new_offset) = - *(ValType*)((char*)d_shard_vals + i * val_size); + float* cur = (float*)((char*)d_vals + new_offset); + float* shard_val = (float*)((char*)d_shard_vals + uint64_t(i) * val_size); + int mf_dim = int( + shard_val[feature_value_accessor.common_feature_value.MfDimIndex()]); + + feature_value_accessor.FeatureValueFill(cur, shard_val, mf_dim); } } @@ -312,15 +322,20 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage, debug_synchronous)); } -template -void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys, - KeyType* d_keys, - GradType* d_shard_grads, - GradType* d_grads, - T* idx, - long long len, - size_t grad_value_size, - const StreamType& stream) { +template +void HeterCommKernel::dy_mf_fill_shard_grads( + KeyType* d_shard_keys, + KeyType* d_keys, + float* d_shard_grads, + float* d_grads, + T* idx, + long long len, + size_t grad_value_size, + const StreamType& stream, + FVAccessor& feature_value_accessor) { int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; dy_mf_fill_shard_grads_kernel<<>>( @@ -330,10 +345,11 @@ void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys, d_grads, idx, c_len, - grad_value_size); + grad_value_size, + feature_value_accessor); } -template +template void HeterCommKernel::merge_gradient(const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -342,23 +358,33 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset, int n, size_t grad_value_size, DynamicGradMerger& merger_, - const StreamType& stream) { + const StreamType& stream, + FVAccessor& feature_value_accessor) { int grid_size = (n - 1) / block_size_ + 1; merge_gradients_kernel<<>>( - offset, fea_num, index, input, output, n, grad_value_size, merger_); + offset, + fea_num, + index, + input, + output, + n, + grad_value_size, + merger_, + feature_value_accessor); } -template -void HeterCommKernel::dy_mf_fill_dvals(ValType* d_shard_vals, - ValType* d_vals, +template +void HeterCommKernel::dy_mf_fill_dvals(float* d_shard_vals, + float* d_vals, T* idx, long long len, size_t val_size, - const StreamType& stream) { + const StreamType& stream, + FVAccessor& feature_value_accessor) { int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; dy_mf_fill_dvals_kernel<<>>( - d_shard_vals, d_vals, idx, c_len, val_size); + d_shard_vals, d_vals, idx, c_len, val_size, feature_value_accessor); } template void HeterCommKernel::fill_idx( @@ 
-402,17 +428,15 @@ template void HeterCommKernel::fill_shard_key( long long len, const cudaStream_t& stream); -template void HeterCommKernel::fill_shard_grads< - unsigned long, - paddle::framework::FeaturePushValue, - int, - cudaStream_t>(unsigned long* d_shard_keys, - unsigned long* d_keys, - paddle::framework::FeaturePushValue* d_shard_grads, - paddle::framework::FeaturePushValue* d_grads, - int* idx, - long long len, - const cudaStream_t& stream); +template void +HeterCommKernel::fill_shard_grads( + unsigned long* d_shard_keys, + unsigned long* d_keys, + float* d_shard_grads, + float* d_grads, + int* idx, + long long len, + const cudaStream_t& stream); template void HeterCommKernel::fill_dvals( @@ -467,20 +491,23 @@ template void HeterCommKernel::reduce_by_key< cudaStream_t stream, bool debug_synchronous); -template void HeterCommKernel::dy_mf_fill_shard_grads< - unsigned long, - paddle::framework::FeaturePushValue, - int, - cudaStream_t>(unsigned long* d_shard_keys, - unsigned long* d_keys, - paddle::framework::FeaturePushValue* d_shard_grads, - paddle::framework::FeaturePushValue* d_grads, - int* idx, - long long len, - size_t grad_value_size, - const cudaStream_t& stream); - -template void HeterCommKernel::merge_gradient( +template void +HeterCommKernel::dy_mf_fill_shard_grads( + unsigned long* d_shard_keys, + unsigned long* d_keys, + float* d_shard_grads, + float* d_grads, + int* idx, + long long len, + size_t grad_value_size, + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); + +template void +HeterCommKernel::merge_gradient( const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -489,16 +516,18 @@ template void HeterCommKernel::merge_gradient( int n, size_t grad_value_size, DynamicGradMerger& merger_, - const cudaStream_t& stream); + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); template void HeterCommKernel:: - dy_mf_fill_dvals( - paddle::framework::FeatureValue* d_shard_vals, - paddle::framework::FeatureValue* d_vals, + dy_mf_fill_dvals( + float* d_shard_vals, + float* d_vals, int* idx, long long len, size_t val_size, - const cudaStream_t& stream); + const cudaStream_t& stream, + CommonFeatureValueAccessor& feature_value_accessor); #endif } // namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index d1555dc2e0919..57f0aff4b6e56 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -41,25 +41,16 @@ struct DynamicGradMerger { return out; } - template - __device__ __forceinline__ void update_one(T& output, const T& input) { - output.slot = input.slot; - output.show = input.show; - output.clk = input.clk; - output.mf_dim = input.mf_dim; - output.lr_g = input.lr_g; - for (int i = 0; i < output.mf_dim; ++i) { - output.mf_g[i] = input.mf_g[i]; - } + template + __device__ __forceinline__ void update_one( + float* output, const float* input, FVAccessor& feature_value_accessor) { + feature_value_accessor.PushValueFill(output, input); } - template - __device__ __forceinline__ void merge_one(T& output, const T& input) { - output.show += input.show; - output.clk += input.clk; - output.lr_g += input.lr_g; - for (int i = 0; i < input.mf_dim; ++i) { - output.mf_g[i] += input.mf_g[i]; - } + + template + __device__ __forceinline__ void merge_one( + float* output, const float* input, FVAccessor& feature_value_accessor) { + 
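update_one seeds the output record from the first duplicate and merge_one accumulates the remaining ones through the accessor. A minimal sketch of what such a merge step can look like, with an assumed push-value layout that is not necessarily the real CommonPushValue ordering:

    // Assumed layout: [slot, show, click, mf_dim, embed_g, embedx_g[0..mf_dim)].
    __device__ inline void MergePushValueSketch(float* out, const float* in) {
      int mf_dim = static_cast<int>(out[3]);                // assumed MfDimIndex() == 3
      out[1] += in[1];                                      // show
      out[2] += in[2];                                      // click
      out[4] += in[4];                                      // embed (lr) gradient
      for (int i = 0; i < mf_dim; ++i) out[5 + i] += in[5 + i];  // embedx gradients
    }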
feature_value_accessor.MergePushValue(output, input); } }; @@ -146,19 +137,20 @@ class HeterCommKernel { bool debug_synchronous = false); template + typename StreamType, + typename FVAccessor> void dy_mf_fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys, - GradType* d_shard_grads, - GradType* d_grads, + float* d_shard_grads, + float* d_grads, T* idx, long long len, size_t grad_value_size, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); - template + template void merge_gradient(const uint32_t* offset, const uint32_t* fea_num, const uint32_t* index, @@ -167,15 +159,17 @@ class HeterCommKernel { int n, size_t grad_value_size, DynamicGradMerger& merger_, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); - template - void dy_mf_fill_dvals(ValType* d_shard_vals, - ValType* d_vals, + template + void dy_mf_fill_dvals(float* d_shard_vals, + float* d_vals, T* idx, long long len, size_t val_size, - const StreamType& stream); + const StreamType& stream, + FVAccessor& feature_value_accessor); private: int block_size_{256}; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc index 82f5393c3660b..4eff4a8ad55b9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cc @@ -22,34 +22,43 @@ namespace paddle { namespace framework { HeterPsBase* HeterPsBase::get_instance( - size_t capacity, std::shared_ptr resource) { - return new HeterPs(capacity, resource); + size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } else { + VLOG(0) << " HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type_; + return new HeterPs( + capacity, resource, accessor_type, fleet_config, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { - comm_ = - std::make_shared>( - capacity, resource); +HeterPs::HeterPs(size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); + optimizer_type_ = optimizer_type; } HeterPs::~HeterPs() {} void HeterPs::pull_sparse(int num, FeatureKey* d_keys, - FeatureValue* d_vals, + float* d_vals, size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } -void HeterPs::build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) { - comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); -} - int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } @@ -68,7 +77,7 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } void HeterPs::push_sparse(int num, FeatureKey* d_keys, - FeaturePushValue* d_grads, + float* d_grads, size_t len) { comm_->push_sparse(num, d_keys, d_grads, len); // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 005cbd401223d..b059690990370 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ 
b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -22,80 +22,139 @@ namespace paddle { namespace framework { HeterPsBase* HeterPsBase::get_instance( - size_t capacity, std::shared_ptr resource) { - return new HeterPs(capacity, resource); + size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + if (accessor_type == "CtrDymfAccessor" && + (optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) { + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } else { + VLOG(0) << " HeterPsBase get_instance Warning: now only support " + "CtrDymfAccessor, but get " + << accessor_type; + return new HeterPs( + capacity, resource, fleet_config, accessor_type, optimizer_type); + } } -HeterPs::HeterPs(size_t capacity, std::shared_ptr resource) { - comm_ = - std::make_shared>( - capacity, resource); - opt_ = Optimizer(); +template +HeterPs::HeterPs( + size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type) { + comm_ = std::make_shared>( + capacity, resource); + feature_value_accessor_.Configure(fleet_config); + set_accessor(feature_value_accessor_); + accessor_type_ = accessor_type; + optimizer_type_ = optimizer_type; } -HeterPs::~HeterPs() {} +template +HeterPs::~HeterPs() {} -void HeterPs::pull_sparse(int num, - FeatureKey* d_keys, - FeatureValue* d_vals, - size_t len) { +template +void HeterPs::pull_sparse(int num, + FeatureKey* d_keys, + float* d_vals, + size_t len) { comm_->pull_sparse(num, d_keys, d_vals, len); } -void HeterPs::build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) { - comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); -} - -void HeterPs::build_ps(int num, - FeatureKey* h_keys, - char* pool, - size_t len, - size_t feature_value_size, - size_t chunk_size, - int stream_num) { +template +void HeterPs::build_ps(int num, + FeatureKey* h_keys, + char* pool, + size_t len, + size_t feature_value_size, + size_t chunk_size, + int stream_num) { comm_->build_ps( num, h_keys, pool, len, feature_value_size, chunk_size, stream_num); } -int HeterPs::get_index_by_devid(int devid) { +template +int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } -void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_sparse_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_sparse_sgd(optimizer_config); } -void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { +template +void HeterPs::set_embedx_sgd( + const OptimizerConfig& optimizer_config) { comm_->set_embedx_sgd(optimizer_config); } -void HeterPs::end_pass() { comm_->end_pass(); } +template +void HeterPs::end_pass() { + comm_->end_pass(); +} -void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } +template +void HeterPs::show_one_table(int gpu_num) { + comm_->show_one_table(gpu_num); +} -void HeterPs::push_sparse(int num, - FeatureKey* d_keys, - FeaturePushValue* d_grads, - size_t len) { - comm_->push_sparse(num, d_keys, d_grads, len, opt_); - // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); +template +void HeterPs::push_sparse(int num, + FeatureKey* d_keys, + float* d_grads, + size_t len) { + if (accessor_type_ == "CtrDymfAccessor") { + if (optimizer_type_ == 3) { // adam + auto optimizer = SparseAdamOptimizer(feature_value_accessor_); 
+ VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 4) { // shared_adam + auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else if (optimizer_type_ == 1) { // adagrad { + auto optimizer = SparseAdagradOptimizer(feature_value_accessor_); + VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():" + << optimizer.EmbedDim(); + comm_->push_sparse(num, d_keys, d_grads, len, optimizer); + } else { + VLOG(0) << " push sparse Error: CtrDymfAccessor only support adagrad(1)," + "adam(3) or shared_adam(4), bug get optimizer type:" + << optimizer_type_; + } + } else { + VLOG(0) << " push sparse Error: now only support CtrDymfAccessor, but get " + << accessor_type_; + } } -void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, - const std::vector& inter_comms, - int comm_size) { +template +void HeterPs::set_nccl_comm_and_size( + const std::vector& inner_comms, + const std::vector& inter_comms, + int comm_size) { comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); } -void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { +template +void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); } +template +void HeterPs::set_accessor(FVAccessor& accessor) { + comm_->set_accessor(accessor); +} + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 7fee229738830..439f5d6c81854 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -26,24 +26,23 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +template class HeterPs : public HeterPsBase { public: HeterPs() {} - HeterPs(size_t capacity, std::shared_ptr resource); + HeterPs(size_t capacity, + std::shared_ptr resource, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type); virtual ~HeterPs(); HeterPs(const HeterPs&) = delete; HeterPs& operator=(const HeterPs&) = delete; void pull_sparse(int num, FeatureKey* d_keys, - FeatureValue* d_vals, + float* d_vals, size_t len) override; - void build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) override; void build_ps(int num, FeatureKey* h_keys, char* pool, @@ -56,6 +55,8 @@ class HeterPs : public HeterPsBase { const std::vector& inter_comms, int comm_size) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; + + void set_accessor(FVAccessor& accessor); #endif void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; @@ -66,13 +67,15 @@ class HeterPs : public HeterPsBase { void show_one_table(int gpu_num) override; void push_sparse(int num, FeatureKey* d_keys, - FeaturePushValue* d_grads, + float* d_grads, size_t len) override; private: - std::shared_ptr> comm_; + std::shared_ptr> comm_; #if defined(PADDLE_WITH_CUDA) - Optimizer opt_; + FVAccessor feature_value_accessor_; + std::string accessor_type_; + int optimizer_type_; #endif }; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index acc984f14adaa..e45d1db71ccae 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -34,14 +34,8 @@ class HeterPsBase { virtual void pull_sparse(int num, FeatureKey* d_keys, - FeatureValue* d_vals, + float* d_vals, size_t len) = 0; - virtual void build_ps(int num, - FeatureKey* h_keys, - FeatureValue* h_vals, - size_t len, - size_t chunk_size, - int stream_num) = 0; virtual void build_ps(int num, FeatureKey* h_keys, char* pool, @@ -56,19 +50,25 @@ class HeterPsBase { const std::vector& inter_comms, int comm_size) = 0; virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; + #endif virtual void end_pass() = 0; virtual void show_one_table(int gpu_num) = 0; virtual void push_sparse(int num, FeatureKey* d_keys, - FeaturePushValue* d_grads, + float* d_grads, size_t len) = 0; virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0; virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0; - static HeterPsBase* get_instance(size_t capacity, - std::shared_ptr resource); + static HeterPsBase* get_instance( + size_t capacity, + std::shared_ptr resource, + // CommonFeatureValueAccessor feature_value_accessor, + std::unordered_map fleet_config, + std::string accessor_type, + int optimizer_type); }; } // end namespace framework diff --git a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h index 88c3136dd77d1..05e252b2afe44 100644 --- a/paddle/fluid/framework/fleet/heter_ps/mem_pool.h +++ b/paddle/fluid/framework/fleet/heter_ps/mem_pool.h @@ -82,20 +82,6 @@ class HBMMemoryPool : public managed { cudaMemset(mem_, 0, block_size_ * capacity); } - friend std::ostream& operator<<(std::ostream& out, HBMMemoryPool& p) { - for (size_t k = 0; k < 5; k++) { - auto x = (FeatureValue*)(p.mem() + k * p.capacity()); - out << "show: " << x->show << " clk: " << x->clk << " slot: " << x->slot - << " lr: " << x->lr << " 
mf_dim: " << x->mf_size - << " mf_size: " << x->mf_size << " mf:"; - for (int i = 0; i < x->mf_size + 1; ++i) { - out << " " << x->mf[i]; - } - out << "\n"; - } - return out; - } - char* mem() { return mem_; } size_t capacity() { return capacity_; } diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index 74a4f1ca16c2b..3a6f60fef858b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -27,134 +27,460 @@ namespace paddle { namespace framework { #if defined(PADDLE_WITH_CUDA) -template + class Optimizer { public: - Optimizer() {} + __host__ Optimizer(CommonFeatureValueAccessor feature_value_accessor) { + feature_value_accessor_ = feature_value_accessor; + } + __host__ ~Optimizer() {} + + __device__ void update_value(const OptimizerConfig& optimizer_config, + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. Please use dy_mf_update_value\n"); + } + + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + float* ptr, + const float* grad) {} - ~Optimizer() {} + CommonFeatureValueAccessor feature_value_accessor_; - void initialize() {} + size_t _embedding_dim; + size_t _lr_embedding_dim; +}; + +class SparseAdagradOptimizer : public Optimizer { + public: + __host__ SparseAdagradOptimizer( + CommonFeatureValueAccessor feature_value_accessor) + : Optimizer(feature_value_accessor) { + _lr_embedding_dim = 1; + _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); + } + + __device__ void update_value_work(const OptimizerConfig& optimizer_config, + int n, + float* w, + float* sgd, // NOLINT + const float* g, + float scale) { + float& g2sum = sgd[G2SumIndex()]; + double add_g2sum = 0; + double ratio = optimizer_config.mf_learning_rate * + sqrt(optimizer_config.mf_initial_g2sum / + (optimizer_config.mf_initial_g2sum + g2sum)); + for (int i = 0; i < n; ++i) { + double scaled_grad = g[i] / scale; + + w[i] += scaled_grad * ratio; + + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; + add_g2sum += scaled_grad * scaled_grad; + } + + g2sum += add_g2sum / n; + } + + __device__ void update_value(const OptimizerConfig& optimizer_config, + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. 
Please use dy_mf_update_value\n"); + } + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + float* ptr, + const float* grad) { + float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; + float g_click = + grad[feature_value_accessor_.common_push_value.ClickIndex()]; + + ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = + grad[feature_value_accessor_.common_push_value.SlotIndex()]; + ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; + ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; + ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] += + optimizer_config.nonclk_coeff * (g_show - g_click) + + optimizer_config.clk_coeff * g_click; + + update_value_work( + optimizer_config, + 1, + ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedGIndex(), + g_show); + + int mf_dim = + int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); + if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) { + if (optimizer_config.mf_create_thresholds <= + optimizer_config.nonclk_coeff * + (ptr[feature_value_accessor_.common_feature_value + .ShowIndex()] - + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) + + optimizer_config.clk_coeff * + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) { + ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = + feature_value_accessor_.common_feature_value.MFSize(mf_dim) / + sizeof(float); + + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + curandState state; + curand_init(clock64(), tid_x, 0, &state); + for (int i = 0; i < mf_dim; ++i) { + ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = + (curand_uniform(&state)) * optimizer_config.mf_initial_range; + } + } + } else { + update_value_work( + optimizer_config, + mf_dim, + ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), + g_show); + } + } + + __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); } + __host__ __device__ size_t EmbedDim() { return _lr_embedding_dim; } + __host__ __device__ size_t EmbedxDim() { return _embedding_dim; } + __host__ __device__ size_t G2SumIndex() { return 0; } + __host__ __device__ size_t EmbedxG2SumIndex() { return 0; } +}; + +class SparseAdamOptimizer : public Optimizer { + public: + __host__ SparseAdamOptimizer( + CommonFeatureValueAccessor feature_value_accessor) + : Optimizer(feature_value_accessor) { + _lr_embedding_dim = 1; + _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); + } __device__ void update_lr(const OptimizerConfig& optimizer_config, - float& w, // NOLINT - float& g2sum, - float g, // NOLINT + int n, + float* w, + float* sgd, + const float* g, float scale) { - double add_g2sum = 0; - double ratio = optimizer_config.learning_rate * - sqrt(optimizer_config.initial_g2sum / - (optimizer_config.initial_g2sum + g2sum)); - double scaled_grad = g / scale; + float* moment1 = sgd + GSumIndex(); + float* moment2 = sgd + G2SumIndex(); + float* beta1_pow = sgd + Beta1PowIndex(); + float* beta2_pow = sgd + Beta2PowIndex(); - w += scaled_grad * ratio; + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; - if (w < 
optimizer_config.min_bound) w = optimizer_config.min_bound; - if (w > optimizer_config.max_bound) w = optimizer_config.max_bound; + float epsilon = 1e-08; + double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) / + (1.0 - beta1_pow_); + for (int i = 0; i < n; ++i) { + double scaled_grad = g[i] / scale; - add_g2sum += scaled_grad * scaled_grad; + double new_moment1 = + optimizer_config.beta1_decay_rate * moment1[i] + + (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad; + double new_moment2 = + optimizer_config.beta2_decay_rate * moment2[i] + + (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad; + w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon)); + + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; - g2sum += add_g2sum; + moment1[i] = new_moment1; + moment2[i] = new_moment2; + } + (*beta1_pow) *= optimizer_config.beta1_decay_rate; + (*beta2_pow) *= optimizer_config.beta2_decay_rate; } __device__ void update_mf(const OptimizerConfig& optimizer_config, int n, float* w, - float& g2sum, // NOLINT + float* sgd, const float* g, float scale) { - double add_g2sum = 0; - double ratio = optimizer_config.mf_learning_rate * - sqrt(optimizer_config.mf_initial_g2sum / - (optimizer_config.mf_initial_g2sum + g2sum)); + float* moment1 = sgd + EmbedxGSumIndex(); + float* moment2 = sgd + EmbedxG2SumIndex(); + float* beta1_pow = sgd + EmbedxBeta1PowIndex(); + float* beta2_pow = sgd + EmbedxBeta2PowIndex(); + + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + + float epsilon = 1e-08; + double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) / + (1.0 - beta1_pow_); for (int i = 0; i < n; ++i) { double scaled_grad = g[i] / scale; - w[i] += scaled_grad * ratio; + double new_moment1 = + optimizer_config.beta1_decay_rate * moment1[i] + + (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad; + double new_moment2 = + optimizer_config.beta2_decay_rate * moment2[i] + + (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad; + w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon)); if (w[i] < optimizer_config.mf_min_bound) w[i] = optimizer_config.mf_min_bound; if (w[i] > optimizer_config.mf_max_bound) w[i] = optimizer_config.mf_max_bound; - add_g2sum += scaled_grad * scaled_grad; - } - g2sum += add_g2sum / n; + moment1[i] = new_moment1; + moment2[i] = new_moment2; + } + (*beta1_pow) *= optimizer_config.beta1_decay_rate; + (*beta2_pow) *= optimizer_config.beta2_decay_rate; } __device__ void update_value(const OptimizerConfig& optimizer_config, - ValType& val, // NOLINT - const GradType& grad) { - val.slot = grad.slot; - val.show += grad.show; - val.clk += grad.clk; - val.delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + - optimizer_config.clk_coeff * grad.clk; + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. 
Please use dy_mf_update_value\n"); + } + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + float* ptr, + const float* grad) { + float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; + float g_click = + grad[feature_value_accessor_.common_push_value.ClickIndex()]; - update_lr(optimizer_config, val.lr, val.lr_g2sum, grad.lr_g, grad.show); + ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = + grad[feature_value_accessor_.common_push_value.SlotIndex()]; + ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; + ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; + ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] += + optimizer_config.nonclk_coeff * (g_show - g_click) + + optimizer_config.clk_coeff * g_click; - if (val.mf_size == 0) { + update_lr( + optimizer_config, + 1, + ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedGIndex(), + g_show); + int mf_dim = + int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); + if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) { if (optimizer_config.mf_create_thresholds <= - optimizer_config.nonclk_coeff * (val.show - val.clk) + - optimizer_config.clk_coeff * val.clk) { - val.mf_size = MF_DIM + 1; - val.mf[0] = 0; + optimizer_config.nonclk_coeff * + (ptr[feature_value_accessor_.common_feature_value + .ShowIndex()] - + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) + + optimizer_config.clk_coeff * + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) { + ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = + feature_value_accessor_.common_feature_value.MFSize(mf_dim) / + sizeof(float); + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; curandState state; curand_init(clock64(), tid_x, 0, &state); - for (int i = 0; i < MF_DIM; ++i) { - val.mf[i + 1] = + for (int i = 0; i < mf_dim; ++i) { + ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = (curand_uniform(&state)) * optimizer_config.mf_initial_range; } + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate; + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate; } } else { - update_mf(optimizer_config, - MF_DIM, - &val.mf[1], - val.mf[0], - grad.mf_g, - grad.show); + update_mf( + optimizer_config, + mf_dim, + ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), + g_show); + } + // printf("EmbedxGIndex: %f, mf_gsum: %f, ", + // feature_value_accessor_.common_push_value.EmbedxGIndex(), + // ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex()]); + } + + __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); } + __host__ __device__ size_t EmbedDim() { return _lr_embedding_dim * 2 + 2; } + __host__ __device__ size_t EmbedxDim() { return _embedding_dim * 2 + 2; } + __host__ __device__ size_t GSumIndex() { return 0; } + __host__ __device__ size_t G2SumIndex() { + return GSumIndex() + _lr_embedding_dim; + } + __host__ __device__ size_t Beta1PowIndex() { + return G2SumIndex() + _lr_embedding_dim; + } + __host__ 
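Each feature carries its own beta1_pow / beta2_pow inside the value, so the bias correction reflects how many times that particular key has been pushed rather than a global step counter. Per push the rule is the standard Adam update on the show-scaled gradient g, clipped to [mf_min_bound, mf_max_bound]:

    m = beta1 * m + (1 - beta1) * g;
    v = beta2 * v + (1 - beta2) * g * g;
    w += lr * sqrt(1 - beta2_pow) / (1 - beta1_pow) * m / (sqrt(v) + 1e-08);
    beta1_pow *= beta1;  beta2_pow *= beta2;   // advance the per-key correction terms

with defaults beta1 = 0.9 and beta2 = 0.999 from OptimizerConfig.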
__device__ size_t Beta2PowIndex() { return Beta1PowIndex() + 1; } + __host__ __device__ size_t EmbedxGSumIndex() { return 0; } + __host__ __device__ size_t EmbedxG2SumIndex() { + return EmbedxGSumIndex() + _embedding_dim; + } + __host__ __device__ size_t EmbedxBeta1PowIndex() { + return EmbedxG2SumIndex() + _embedding_dim; + } + __host__ __device__ size_t EmbedxBeta2PowIndex() { + return EmbedxBeta1PowIndex() + 1; + } +}; + +class SparseAdamSharedOptimizer : public Optimizer { + public: + __host__ SparseAdamSharedOptimizer( + CommonFeatureValueAccessor feature_value_accessor) + : Optimizer(feature_value_accessor) { + _lr_embedding_dim = 1; + _embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim(); + } + + __device__ void update_value_work(const OptimizerConfig& optimizer_config, + int n, + float* w, + float* sgd, + const float* g, + float scale) { + float* moment1 = sgd + GSumIndex(); + float* moment2 = sgd + G2SumIndex(); + float* beta1_pow = sgd + Beta1PowIndex(); + float* beta2_pow = sgd + Beta2PowIndex(); + + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + float moment1_ = *moment1; + float moment2_ = *moment2; + float epsilon = 1e-08; + double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) / + (1.0 - beta1_pow_); + + double sum_mom1 = 0.0; + double sum_mom2 = 0.0; + for (int i = 0; i < n; ++i) { + double scaled_grad = g[i] / scale; + + double new_moment1 = + optimizer_config.beta1_decay_rate * moment1_ + + (1.0 - optimizer_config.beta1_decay_rate) * scaled_grad; + double new_moment2 = + optimizer_config.beta2_decay_rate * moment2_ + + (1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad; + w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon)); + + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; + + sum_mom1 += new_moment1; + sum_mom2 += new_moment2; } + + (*moment1) = sum_mom1 / n; + (*moment2) = sum_mom2 / n; + (*beta1_pow) *= optimizer_config.beta1_decay_rate; + (*beta2_pow) *= optimizer_config.beta2_decay_rate; + } + + __device__ void update_value(const OptimizerConfig& optimizer_config, + float& val, // NOLINT + const float& grad) { + printf( + "Warning: update_value will not used. 
Please use dy_mf_update_value\n"); } __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, - ValType* ptr, - const GradType& grad) { - ptr->slot = grad.slot; - ptr->show += grad.show; - ptr->clk += grad.clk; - ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + - optimizer_config.clk_coeff * grad.clk; - - update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); - // use MF_DIM temporarily - // ptr->mf_dim = grad.mf_dim; - - if (ptr->mf_size == 0) { + float* ptr, + const float* grad) { + float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()]; + float g_click = + grad[feature_value_accessor_.common_push_value.ClickIndex()]; + + ptr[feature_value_accessor_.common_feature_value.SlotIndex()] = + grad[feature_value_accessor_.common_push_value.SlotIndex()]; + ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show; + ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click; + ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] += + optimizer_config.nonclk_coeff * (g_show - g_click) + + optimizer_config.clk_coeff * g_click; + + update_value_work( + optimizer_config, + 1, + ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedGIndex(), + g_show); + int mf_dim = + int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]); + if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) { if (optimizer_config.mf_create_thresholds <= - optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config.clk_coeff * ptr->clk) { - ptr->mf_size = ptr->mf_dim + 1; + optimizer_config.nonclk_coeff * + (ptr[feature_value_accessor_.common_feature_value + .ShowIndex()] - + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) + + optimizer_config.clk_coeff * + ptr[feature_value_accessor_.common_feature_value + .ClickIndex()]) { + ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] = + feature_value_accessor_.common_feature_value.MFSize(mf_dim) / + sizeof(float); - // ptr->mf_size = MF_DIM + 1; - ptr->mf[0] = 0; int tid_x = blockIdx.x * blockDim.x + threadIdx.x; curandState state; curand_init(clock64(), tid_x, 0, &state); - for (int i = 0; i < ptr->mf_dim; ++i) { - ptr->mf[i + 1] = + for (int i = 0; i < mf_dim; ++i) { + ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] = (curand_uniform(&state)) * optimizer_config.mf_initial_range; } + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate; + ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() + + EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate; } } else { - update_mf(optimizer_config, - ptr->mf_dim, - &(ptr->mf[1]), - ptr->mf[0], - grad.mf_g, - grad.show); // for local test + update_value_work( + optimizer_config, + mf_dim, + ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(), + ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(), + grad + feature_value_accessor_.common_push_value.EmbedxGIndex(), + g_show); } } + + __host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); } + __host__ __device__ size_t EmbedDim() { return 4; } + __host__ __device__ size_t EmbedxDim() { return 4; } + __host__ __device__ size_t GSumIndex() { return 0; } + __host__ __device__ size_t G2SumIndex() 
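The shared variant keeps a single (gsum, g2sum, beta1_pow, beta2_pow) quadruple for the whole embedding instead of per-dimension moments: update_value_work averages the freshly computed moments over the mf_dim dimensions before writing them back, which fixes the embedx optimizer state at 4 floats per feature (EmbedxDim() == 4) regardless of the embedding width.

    // embedx optimizer state, shared-adam layout (indices per the helpers below):
    // [ gsum, g2sum, beta1_pow, beta2_pow ]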
{ return GSumIndex() + 1; } + __host__ __device__ size_t Beta1PowIndex() { return G2SumIndex() + 1; } + __host__ __device__ size_t Beta2PowIndex() { return Beta1PowIndex() + 1; } + __host__ __device__ size_t EmbedxGSumIndex() { return 0; } + __host__ __device__ size_t EmbedxG2SumIndex() { + return EmbedxGSumIndex() + 1; + } + __host__ __device__ size_t EmbedxBeta1PowIndex() { + return EmbedxG2SumIndex() + 1; + } + __host__ __device__ size_t EmbedxBeta2PowIndex() { + return EmbedxBeta1PowIndex() + 1; + } }; #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h index 0db72992215a2..2db259941c873 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h @@ -27,13 +27,19 @@ class OptimizerConfig { float learning_rate = 0.05; float initial_g2sum = 3.0; float initial_range = 0; + float beta1_decay_rate = 0.9; // adam + float beta2_decay_rate = 0.999; // adam + float ada_epsilon = 1e-8; float mf_create_thresholds = 10; float mf_learning_rate = 0.05; float mf_initial_g2sum = 3.0; float mf_initial_range = 1e-4; + float mf_beta1_decay_rate = 0.9; // adam + float mf_beta2_decay_rate = 0.999; // adam float mf_min_bound = -10; float mf_max_bound = 10; + float mf_ada_epsilon = 1e-8; void set_sparse_sgd(float nonclk_coeff, float clk_coeff, @@ -41,7 +47,10 @@ class OptimizerConfig { float max_bound, float learning_rate, float initial_g2sum, - float initial_range) { + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon) { this->nonclk_coeff = nonclk_coeff; this->clk_coeff = clk_coeff; this->min_bound = min_bound; @@ -49,6 +58,9 @@ class OptimizerConfig { this->learning_rate = learning_rate; this->initial_g2sum = initial_g2sum; this->initial_range = initial_range; + this->beta1_decay_rate = beta1_decay_rate; + this->beta2_decay_rate = beta2_decay_rate; + this->ada_epsilon = ada_epsilon; } void set_sparse_sgd(const OptimizerConfig& optimizer_config) { @@ -59,6 +71,9 @@ class OptimizerConfig { this->learning_rate = optimizer_config.learning_rate; this->initial_g2sum = optimizer_config.initial_g2sum; this->initial_range = optimizer_config.initial_range; + this->beta1_decay_rate = optimizer_config.beta1_decay_rate; + this->beta2_decay_rate = optimizer_config.beta2_decay_rate; + this->ada_epsilon = optimizer_config.ada_epsilon; } void set_embedx_sgd(float mf_create_thresholds, @@ -66,13 +81,19 @@ class OptimizerConfig { float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, - float mf_max_bound) { + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon) { this->mf_create_thresholds = mf_create_thresholds; this->mf_learning_rate = mf_learning_rate; this->mf_initial_g2sum = mf_initial_g2sum; this->mf_initial_range = mf_initial_range; this->mf_min_bound = mf_min_bound; this->mf_max_bound = mf_max_bound; + this->mf_beta1_decay_rate = mf_beta1_decay_rate; + this->mf_beta2_decay_rate = mf_beta2_decay_rate; + this->mf_ada_epsilon = mf_ada_epsilon; } void set_embedx_sgd(const OptimizerConfig& optimizer_config) { @@ -82,6 +103,9 @@ class OptimizerConfig { this->mf_initial_range = optimizer_config.mf_initial_range; this->mf_min_bound = optimizer_config.mf_min_bound; this->mf_max_bound = optimizer_config.mf_max_bound; + this->mf_beta1_decay_rate = optimizer_config.mf_beta1_decay_rate; + this->mf_beta2_decay_rate = optimizer_config.mf_beta2_decay_rate; + this->mf_ada_epsilon = 
optimizer_config.mf_ada_epsilon; } }; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 94fa386aac488..d9bb6e946f42d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -33,9 +33,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/platform/timer.h" #if defined(PADDLE_WITH_PSCORE) -#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" #include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #endif @@ -135,7 +135,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { std::string data_set_name = std::string(typeid(*dataset_).name()); if (data_set_name.find("SlotRecordDataset") != std::string::npos) { - SlotRecordDataset* dataset = dynamic_cast(dataset_); + SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_); auto input_channel = dataset->GetInputChannel(); VLOG(0) << "psgpu wrapperinputslotchannle size: " << input_channel->Size(); const std::deque& vec_data = input_channel->GetData(); @@ -185,7 +185,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { } else { CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; - MultiSlotDataset* dataset = dynamic_cast(dataset_); + MultiSlotDataset* dataset = (MultiSlotDataset*)(dataset_); auto input_channel = dataset->GetInputChannel(); const std::deque& vec_data = input_channel->GetData(); @@ -540,17 +540,17 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { &device_vals, &device_task_keys, &device_task_ptrs](int dev, int shard_id) { - auto& task_keys = device_task_keys[shard_id]; + // auto& task_keys = device_task_keys[shard_id]; #ifdef PADDLE_WITH_PSLIB auto& task_ptrs = device_task_ptrs[shard_id]; #endif -#ifdef PADDLE_WITH_PSCORE - auto& task_ptrs = device_task_ptrs[shard_id]; -#endif + // #ifdef PADDLE_WITH_PSCORE + // auto& task_ptrs = device_task_ptrs[shard_id]; + // #endif - int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; - int cur = prefix_sum[dev][shard_id]; + // int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; + // int cur = prefix_sum[dev][shard_id]; #ifdef PADDLE_WITH_PSLIB for (int j = 0; j < len; ++j) { device_keys[dev][cur + j] = task_keys[dev][j]; @@ -579,33 +579,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { } } #endif -#ifdef PADDLE_WITH_PSCORE - for (int j = 0; j < len; ++j) { - device_keys[dev][cur + j] = task_keys[dev][j]; - float* ptr_val = task_ptrs[dev][j]->data(); - FeatureValue& val = device_vals[dev][cur + j]; - size_t dim = task_ptrs[dev][j]->size(); - val.delta_score = ptr_val[2]; - val.show = ptr_val[3]; - val.clk = ptr_val[4]; - val.slot = ptr_val[0]; - val.lr = ptr_val[5]; - val.lr_g2sum = ptr_val[6]; - val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); - - if (dim > 7) { - val.mf_size = MF_DIM + 1; - for (int x = 0; x < val.mf_size; x++) { - val.mf[x] = ptr_val[x + 7]; - } - } else { - val.mf_size = 0; - for (int x = 0; x < MF_DIM + 1; x++) { - val.mf[x] = 0; - } - } - } -#endif VLOG(3) << "GpuPs build hbmps done"; }; @@ -665,16 +638,25 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { return; } std::vector threads(device_num); - HeterPs_ = HeterPsBase::get_instance(size_max, resource_); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + HeterPs_ = HeterPsBase::get_instance( + size_max, 
resource_, fleet_config_, accessor_class_, optimizer_type_); #ifdef PADDLE_WITH_CUDA HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); + HeterPs_->set_sparse_sgd(optimizer_config_); + HeterPs_->set_embedx_sgd(optimizer_config_); #endif - auto build_dymf_mem_pool = [this, &gpu_task](int i, int j) { + auto build_dymf_mem_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, + int j) { this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); int mf_dim = this->index_dim_vec_[j]; + VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim + << " feature_value_size:" + << accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); size_t feature_value_size = - TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; size_t len = device_dim_keys.size(); @@ -682,12 +664,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { this->mem_pools_[i * this->multi_mf_dim_ + j] = new MemoryPool(len, feature_value_size); }; - auto build_dymf_hbm_pool = [this, &gpu_task](int i, int j) { + auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i, + int j) { auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; size_t len = device_dim_keys.size(); int mf_dim = this->index_dim_vec_[j]; size_t feature_value_size = - TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; platform::CUDADeviceGuard guard(resource_->dev_id(i)); @@ -710,13 +693,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { delete mem_pool; }; int thread_num = 16; - auto build_dynamic_mf_func = [this, &gpu_task, thread_num]( - int i, int j, int z) { + auto build_dynamic_mf_func = [this, + &gpu_task, + thread_num, + &accessor_wrapper_ptr](int i, int j, int z) { // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); int mf_dim = this->index_dim_vec_[j]; VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim; - // size_t feature_value_size = - // TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; size_t len = device_dim_keys.size(); @@ -743,10 +726,10 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { // ============ add for multi-thread ================ for (size_t k = left; k < right; k++) { - FeatureValue* val = (FeatureValue*)(mem_pool->mem_address(k)); +#ifdef PADDLE_WITH_PSLIB + float* val = (float*)(mem_pool->mem_address(k)); float* ptr_val = device_dim_ptrs[k]->data(); size_t dim = device_dim_ptrs[k]->size(); -#ifdef PADDLE_WITH_PSLIB val->delta_score = ptr_val[paddle::ps::DownpourCtrDymfAccessor:: DownpourCtrDymfFeatureValue::delta_score_index()]; @@ -765,23 +748,6 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: mf_dim_index()] = float(mf_dim); val->mf_dim = mf_dim; -#endif -#ifdef PADDLE_WITH_PSCORE - paddle::distributed::CtrDymfAccessor accessor; - val->delta_score = - ptr_val[accessor.common_feature_value.DeltaScoreIndex()]; - val->show = ptr_val[accessor.common_feature_value.ShowIndex()]; - val->clk = ptr_val[accessor.common_feature_value.ClickIndex()]; - val->slot = 
int(ptr_val[accessor.common_feature_value.SlotIndex()]); - val->lr = ptr_val[accessor.common_feature_value.EmbedWIndex()]; - val->lr_g2sum = ptr_val[accessor.common_feature_value.EmbedG2SumIndex()]; - - val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]); - - // TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor - ptr_val[accessor.common_feature_value.MfDimIndex()] = float(mf_dim); - val->mf_dim = mf_dim; -#endif if (dim > 8) { // CpuPS alreay expand as mf_dim val->mf_size = mf_dim + 1; for (int x = 0; x < val->mf_dim + 1; x++) { @@ -793,6 +759,12 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { val->mf[x] = 0; } } +#endif +#ifdef PADDLE_WITH_PSCORE + void* val = mem_pool->mem_address(k); + accessor_wrapper_ptr->BuildFill( + val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim); +#endif } }; @@ -945,7 +917,10 @@ void PSGPUWrapper::EndPass() { } } int thread_num = 8; - auto dump_pool_to_cpu_func = [this, thread_num](int i, int j, int z) { + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr]( + int i, int j, int z) { PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; auto& device_keys = this->current_task_->device_dim_keys_[i][j]; @@ -965,9 +940,11 @@ void PSGPUWrapper::EndPass() { } // ============ multi-thread process feasign============ int mf_dim = this->index_dim_vec_[j]; - VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim; size_t feature_value_size = - TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); + accessor_wrapper_ptr->GetFeatureValueSize(mf_dim); + VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim + << " key_len :" << len + << " feature_value_size:" << feature_value_size; char* test_build_values = (char*)malloc(feature_value_size * real_len); uint64_t offset = left * feature_value_size; cudaMemcpy(test_build_values, @@ -981,7 +958,7 @@ void PSGPUWrapper::EndPass() { continue; } size_t local_offset = (i - left) * feature_value_size; - FeatureValue* gpu_val = (FeatureValue*)(test_build_values + local_offset); + float* gpu_val = (float*)(test_build_values + local_offset); #ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); @@ -1002,31 +979,15 @@ void PSGPUWrapper::EndPass() { embed_g2sum_index()] = gpu_val->lr_g2sum; cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: slot_index()] = gpu_val->slot; -#endif -#ifdef PADDLE_WITH_PSCORE - auto* downpour_value = - (paddle::distributed::FixedFeatureValue*)(gpu_val->cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val->mf_size > 0 && downpour_value_size == 8) { - downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size); - } - float* cpu_val = downpour_value->data(); - - paddle::distributed::CtrDymfAccessor accessor; - cpu_val[accessor.common_feature_value.DeltaScoreIndex()] = - gpu_val->delta_score; - cpu_val[accessor.common_feature_value.ShowIndex()] = gpu_val->show; - cpu_val[accessor.common_feature_value.ClickIndex()] = gpu_val->clk; - cpu_val[accessor.common_feature_value.EmbedWIndex()] = gpu_val->lr; - cpu_val[accessor.common_feature_value.EmbedG2SumIndex()] = - gpu_val->lr_g2sum; - cpu_val[accessor.common_feature_value.SlotIndex()] = gpu_val->slot; -#endif if (gpu_val->mf_size > 0) { for (int x = 0; x < gpu_val->mf_dim + 1; x++) { cpu_val[x + 8] = 
gpu_val->mf[x]; } } +#endif +#ifdef PADDLE_WITH_PSCORE + accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim); +#endif } free(test_build_values); }; @@ -1066,79 +1027,8 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, const std::vector& values, const std::vector& slot_lengths, const int hidden_size) { - platform::Timer all_timer; - platform::Timer pull_gpups_timer; - all_timer.Start(); - int64_t total_length = - std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); - VLOG(3) << "Begine Gpu/Xpu Ps PullSparse"; - auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue)); - FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); - if (platform::is_cpu_place(place)) { - PADDLE_THROW(platform::errors::Unimplemented( - "Warning:: CPUPlace is not supported in GpuPs now.")); - } else if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA - VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; - int device_id = place.GetDeviceId(); - int devid_2_index = HeterPs_->get_index_by_devid(device_id); - LoDTensor& total_keys_tensor = keys_tensor[devid_2_index]; - uint64_t* total_keys = reinterpret_cast( - total_keys_tensor.mutable_data({total_length, 1}, place)); - - // construct slot_level lod info - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); - auto buf_length = - memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - cudaMemcpy(gpu_keys, - keys.data(), - keys.size() * sizeof(uint64_t*), - cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - cudaMemcpyHostToDevice); - - this->CopyKeys(place, - gpu_keys, - total_keys, - gpu_len, - static_cast(slot_lengths.size()), - static_cast(total_length)); - VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index - << " len: " << total_length; - pull_gpups_timer.Start(); - HeterPs_->pull_sparse(devid_2_index, - total_keys, - total_values_gpu, - static_cast(total_length)); - pull_gpups_timer.Pause(); - - VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length - << "]"; - this->CopyForPull(place, - gpu_keys, - values, - total_values_gpu, - gpu_len, - static_cast(slot_lengths.size()), - hidden_size, - total_length); - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "GpuPs: PullSparse Only Support CUDAPlace Now.")); - } - all_timer.Pause(); - VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec() - << " s, of which GPUPS costs: " << pull_gpups_timer.ElapsedSec() - << " s"; - VLOG(3) << "End PullSparse"; + VLOG(0) << "Warning:: recommand use pull_gpups_sparse op instead. 
This " + "PullSparse is not used."; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, @@ -1156,13 +1046,16 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); size_t feature_value_size = 0; - feature_value_size = TYPEALIGN( - 8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1)); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + VLOG(3) << "PullSparse max_dim:" << max_mf_dim_ + << " feature_value_size:" << feature_value_size; #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begine Gpu Ps PullSparse"; auto buf = memory::Alloc(place, total_length * feature_value_size); - FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); + float* total_values_gpu = reinterpret_cast(buf->ptr()); #endif #ifdef PADDLE_WITH_XPU_KP VLOG(3) << "Begine Xpu Ps PullSparse"; @@ -1224,19 +1117,19 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, - gpu_keys, - values, - total_values_gpu, - gpu_len, - static_cast(slot_lengths.size()), - hidden_size, - total_length, - gpu_dim); + accessor_wrapper_ptr->CopyForPull(place, + gpu_keys, + values, + total_values_gpu, + gpu_len, + static_cast(slot_lengths.size()), + hidden_size, + total_length, + gpu_dim, + val_type_size_); pull_gpups_timer.Pause(); -#endif } else if (platform::is_xpu_place(place)) { #ifdef PADDLE_WITH_XPU_KP VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; @@ -1283,14 +1176,15 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length << "]"; - this->CopyForPull(place, - xpu_keys, - values, - total_values_gpu, - xpu_len, - static_cast(slot_lengths.size()), - hidden_size, - total_length); + accessor_wrapper_ptr->CopyForPull(place, + xpu_keys, + values, + total_values_gpu, + xpu_len, + static_cast(slot_lengths.size()), + hidden_size, + total_length, + val_type_size_); #endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -1317,12 +1211,13 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); // #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin GPUPS PushSparseGrad"; - size_t grad_value_size = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); auto buf = memory::Alloc(place, total_length * grad_value_size); - VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_; - FeaturePushValue* total_grad_values_gpu = - reinterpret_cast(buf->ptr()); + VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_ + << "grad_value_size:" << grad_value_size; + float* total_grad_values_gpu = reinterpret_cast(buf->ptr()); if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GPUPS now.")); @@ -1334,23 +1229,15 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to gpups struct"; - if (!multi_mf_dim_) { - this->CopyForPush(place, - 
grad_values, - total_grad_values_gpu, - slot_lengths, - hidden_size, - total_length, - batch_size); - } else { - this->CopyForPush(place, - grad_values, - total_grad_values_gpu, - slot_lengths, - total_length, - batch_size, - grad_value_size); - } + accessor_wrapper_ptr->CopyForPush(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + total_length, + batch_size, + grad_value_size, + slot_vector_, + slot_mf_dim_vector_); VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index << " len: " << total_length; @@ -1369,13 +1256,14 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, uint64_t* total_keys = reinterpret_cast(cached_total_keys_tensor.data()); VLOG(3) << "Begin copy grad tensor to xpups struct"; - this->CopyForPush(place, - grad_values, - total_grad_values_gpu, - slot_lengths, - hidden_size, - total_length, - batch_size); + accessor_wrapper_ptr->CopyForPush(place, + grad_values, + total_grad_values_gpu, + slot_lengths, + hidden_size, + total_length, + batch_size, + slot_vector_); VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index << " len: " << total_length; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 734765fa95423..f8624f48d08f3 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -26,90 +26,6 @@ limitations under the License. */ namespace paddle { namespace framework { -__global__ void PullCopy(float** dest, - const FeatureValue* src, - const int64_t* len, - int hidden, - int slot_num, - int total_len, - uint64_t** keys) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[x - 1] : 0); - if (*(keys[x] + y) == 0) { - *(dest[x] + y * hidden) = 0; - *(dest[x] + y * hidden + 1) = 0; - *(dest[x] + y * hidden + 2) = 0; - } else { - *(dest[x] + y * hidden) = (src + i)->show; - *(dest[x] + y * hidden + 1) = (src + i)->clk; - *(dest[x] + y * hidden + 2) = (src + i)->lr; - } - if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = 0; - } - } else { - for (int j = 0; j < hidden - 3; j++) { - *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j]; - } - } - } -} - -__global__ void PullCopy(float** dest, - const FeatureValue* src, - const int64_t* len, - int slot_num, - int total_len, - uint64_t** keys, - uint64_t max_val_size, - int* gpu_dim) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? 
len[x - 1] : 0); - FeatureValue* feature_value_ptr = - (FeatureValue*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); - int mf_dim = gpu_dim[x] - 3; - if (*(keys[x] + y) == 0) { - *(dest[x] + y * (mf_dim + 3)) = 0; - *(dest[x] + y * (mf_dim + 3) + 1) = 0; - *(dest[x] + y * (mf_dim + 3) + 2) = 0; - } else { - *(dest[x] + y * (mf_dim + 3)) = feature_value_ptr->show; - *(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr->clk; - *(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr->lr; - } - if ((feature_value_ptr)->mf_size == 0 || *(keys[x] + y) == 0) { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = 0; - } - } else { - for (int j = 0; j < mf_dim; j++) { - *(dest[x] + y * (mf_dim + 3) + 3 + j) = feature_value_ptr->mf[1 + j]; - } - } - } -} - __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, const int64_t* len, @@ -161,101 +77,8 @@ __global__ void PushCopy(FeaturePushValue* dest, } } -__global__ void PushCopyWithPool(FeaturePushValue* dest, - float** src, - int64_t* len, - int slot_num, - uint64_t total_len, - int bs, - int* slot_vector, - int* mf_dim_vector, - size_t grad_value_size) { - CUDA_KERNEL_LOOP(i, total_len) { - int low = 0; - int high = slot_num - 1; - while (low < high) { - int mid = (low + high) / 2; - if (i < len[mid]) - high = mid; - else - low = mid + 1; - } - int x = low; - int y = i - (x ? len[low - 1] : 0); - FeaturePushValue* cur = - (FeaturePushValue*)((char*)dest + i * grad_value_size); - cur->slot = slot_vector[x]; - int mf_dim = mf_dim_vector[x]; - cur->mf_dim = mf_dim; - cur->show = *(src[x] + y * (mf_dim + 3)); - cur->clk = *(src[x] + y * (mf_dim + 3) + 1); - cur->lr_g = *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs; - for (int j = 0; j < cur->mf_dim; j++) { - cur->mf_g[j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. 
* bs; - } - } -} PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, - values.data(), - values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, - total_values_gpu, - gpu_len, - hidden_size, - slot_num, - total_length, - gpu_keys); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length, - int* gpu_dim) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto buf_value = memory::Alloc(place, values.size() * sizeof(float*)); - float** gpu_values = reinterpret_cast(buf_value->ptr()); - cudaMemcpy(gpu_values, - values.data(), - values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - gpu_values, - total_values_gpu, - gpu_len, - slot_num, - total_length, - gpu_keys, - val_type_size_, - gpu_dim); - cudaStreamSynchronize(stream); -} - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, @@ -270,125 +93,26 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, cudaStreamSynchronize(stream); } -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - - cudaMemcpy(gpu_values, - grad_values.data(), - grad_values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, - slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), - cudaMemcpyHostToDevice); - - PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, - gpu_values, - gpu_len, - hidden_size, - slot_lengths.size(), - total_length, - batch_size, - d_slot_vector); - cudaStreamSynchronize(stream); -} - -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& 
place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, - const int batch_size, - size_t grad_value_size) { - auto stream = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)) - ->stream(); - auto slot_lengths_lod = slot_lengths; - for (int i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_grad_value = - memory::Alloc(place, grad_values.size() * sizeof(float*)); - auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); - auto buf_slot_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - auto buf_mf_dim_vector = - memory::Alloc(place, slot_lengths_lod.size() * sizeof(int)); - float** gpu_values = reinterpret_cast(buf_grad_value->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - int* d_slot_vector = reinterpret_cast(buf_slot_vector->ptr()); - int* d_mf_dim_vector = reinterpret_cast(buf_mf_dim_vector->ptr()); - cudaMemcpy(gpu_values, - grad_values.data(), - grad_values.size() * sizeof(float*), - cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - cudaMemcpyHostToDevice); - cudaMemcpy(d_slot_vector, - slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), - cudaMemcpyHostToDevice); - cudaMemcpy(d_mf_dim_vector, - slot_mf_dim_vector_.data(), - slot_lengths_lod.size() * sizeof(int), - cudaMemcpyHostToDevice); - PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>( - total_grad_values_gpu, - gpu_values, - gpu_len, - slot_lengths.size(), - total_length, - batch_size, - d_slot_vector, - d_mf_dim_vector, - grad_value_size); - cudaStreamSynchronize(stream); -} - void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, - float initial_range) { - OptimizerConfig optimizer_config; - optimizer_config.set_sparse_sgd(nonclk_coeff, - clk_coeff, - min_bound, - max_bound, - learning_rate, - initial_g2sum, - initial_range); - HeterPs_->set_sparse_sgd(optimizer_config); + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon) { + optimizer_config_.set_sparse_sgd(nonclk_coeff, + clk_coeff, + min_bound, + max_bound, + learning_rate, + initial_g2sum, + initial_range, + beta1_decay_rate, + beta2_decay_rate, + ada_epsilon); } void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, @@ -396,15 +120,19 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, - float mf_max_bound) { - OptimizerConfig optimizer_config; - optimizer_config.set_embedx_sgd(mf_create_thresholds, - mf_learning_rate, - mf_initial_g2sum, - mf_initial_range, - mf_min_bound, - mf_max_bound); - HeterPs_->set_embedx_sgd(optimizer_config); + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon) { + optimizer_config_.set_embedx_sgd(mf_create_thresholds, + mf_learning_rate, + mf_initial_g2sum, + mf_initial_range, + mf_min_bound, + mf_max_bound, + mf_beta1_decay_rate, + mf_beta2_decay_rate, + mf_ada_epsilon); } } // end namespace framework diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 65f86acce9151..0d1669a42b1e9 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ 
-51,7 +51,10 @@ limitations under the License. */ #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #endif #ifdef PADDLE_WITH_PSLIB #include "afs_api.h" @@ -64,9 +67,6 @@ limitations under the License. */ namespace paddle { namespace framework { -#define TYPEALIGN(ALIGNVAL, LEN) \ - (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) - class Dataset; #ifdef PADDLE_WITH_PSLIB @@ -98,7 +98,7 @@ class AfsWrapper { class PSGPUWrapper { public: - virtual ~PSGPUWrapper(); + ~PSGPUWrapper(); PSGPUWrapper() { HeterPs_ = NULL; @@ -139,37 +139,6 @@ class PSGPUWrapper { const int64_t* gpu_len, int slot_num, int total_len); - void CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length); - void CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, - const int slot_num, - const int hidden_size, - const int64_t total_length, - int* gpu_dim); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size); - void CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const uint64_t total_length, - const int batch_size, - size_t grad_value_size); void BuildGPUTask(std::shared_ptr gpu_task); void PreBuildTask(std::shared_ptr gpu_task); @@ -274,13 +243,96 @@ class PSGPUWrapper { float max_bound, float learning_rate, float initial_g2sum, - float initial_range); + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon); void SetEmbedxSGD(float mf_create_thresholds, float mf_learning_rate, float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, - float mf_max_bound); + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon); + +#ifdef PADDLE_WITH_PSCORE + void add_sparse_optimizer( + std::unordered_map& config, // NOLINT + const ::paddle::distributed::SparseCommonSGDRuleParameter& sgd_param, + const std::string& prefix = "") { + auto optimizer_name = sgd_param.name(); + if (optimizer_name == "SparseNaiveSGDRule") { + config[prefix + "optimizer_type"] = 0; + config[prefix + "learning_rate"] = sgd_param.naive().learning_rate(); + config[prefix + "initial_range"] = sgd_param.naive().initial_range(); + config[prefix + "min_bound"] = sgd_param.naive().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.naive().weight_bounds()[1]; + } else if (optimizer_name == "SparseAdaGradSGDRule") { + config[prefix + "optimizer_type"] = 1; + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + config[prefix + "min_bound"] = 
sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } else if (optimizer_name == "StdAdaGradSGDRule") { + config[prefix + "optimizer_type"] = 2; + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } else if (optimizer_name == "SparseAdamSGDRule") { + config[prefix + "optimizer_type"] = 3; + config[prefix + "learning_rate"] = sgd_param.adam().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adam().initial_range(); + config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate(); + config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate(); + config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon(); + config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1]; + } else if (optimizer_name == "SparseSharedAdamSGDRule") { + config[prefix + "optimizer_type"] = 4; + config[prefix + "learning_rate"] = sgd_param.adam().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adam().initial_range(); + config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate(); + config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate(); + config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon(); + config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1]; + } + } + + void InitializeGPUServer(paddle::distributed::PSParameter ps_param) { + auto sparse_table = + ps_param.server_param().downpour_server_param().downpour_table_param(0); + auto sparse_table_accessor = sparse_table.accessor(); + auto sparse_table_accessor_parameter = + sparse_table_accessor.ctr_accessor_param(); + accessor_class_ = sparse_table_accessor.accessor_class(); + + std::unordered_map config; + config["embedx_dim"] = sparse_table_accessor.embedx_dim(); + config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); + config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); + config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + + if (accessor_class_ == "CtrDymfAccessor") { + // optimizer config for embed_w and embedx + add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); + add_sparse_optimizer( + config, sparse_table_accessor.embedx_sgd_param(), "mf_"); + } + + fleet_config_ = config; + GlobalAccessorTransfor::GetInstance().Init(accessor_class_); + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper()->Configure( + config); + InitializeGPUServer(config); + } +#endif + void InitializeGPUServer(std::unordered_map config) { float nonclk_coeff = (config.find("nonclk_coeff") == config.end()) ? 1.0 @@ -288,54 +340,83 @@ class PSGPUWrapper { float clk_coeff = (config.find("clk_coeff") == config.end()) ? 1.0 : config["clk_coeff"]; float min_bound = (config.find("min_bound") == config.end()) - ? -10000.0 + ? -10.0 : config["min_bound"]; - float max_bound = (config.find("max_bound") == config.end()) - ? 10000.0 - : config["max_bound"]; + float max_bound = + (config.find("max_bound") == config.end()) ? 
10.0 : config["max_bound"]; float learning_rate = (config.find("learning_rate") == config.end()) - ? 1.0 + ? 0.05 : config["learning_rate"]; float initial_g2sum = (config.find("initial_g2sum") == config.end()) - ? 1.0 + ? 3.0 : config["initial_g2sum"]; float initial_range = (config.find("initial_range") == config.end()) - ? 1.0 + ? 1e-4 : config["initial_range"]; - + float beta1_decay_rate = (config.find("beta1_decay_rate") == config.end()) + ? 0.9 + : config["beta1_decay_rate"]; + float beta2_decay_rate = (config.find("beta2_decay_rate") == config.end()) + ? 0.999 + : config["beta2_decay_rate"]; + float ada_epsilon = (config.find("ada_epsilon") == config.end()) + ? 1e-8 + : config["ada_epsilon"]; // mf config settings float mf_create_thresholds = (config.find("mf_create_thresholds") == config.end()) ? static_cast(1.0) : config["mf_create_thresholds"]; float mf_learning_rate = (config.find("mf_learning_rate") == config.end()) - ? 1.0 + ? 0.05 : config["mf_learning_rate"]; float mf_initial_g2sum = (config.find("mf_initial_g2sum") == config.end()) - ? 1.0 + ? 3.0 : config["mf_initial_g2sum"]; float mf_initial_range = (config.find("mf_initial_range") == config.end()) - ? 1.0 + ? 1e-4 : config["mf_initial_range"]; float mf_min_bound = (config.find("mf_min_bound") == config.end()) - ? 1.0 + ? -10.0 : config["mf_min_bound"]; float mf_max_bound = (config.find("mf_max_bound") == config.end()) - ? 1.0 + ? 10.0 : config["mf_max_bound"]; + float mf_beta1_decay_rate = + (config.find("mf_beta1_decay_rate") == config.end()) + ? 0.9 + : config["mf_beta1_decay_rate"]; + float mf_beta2_decay_rate = + (config.find("mf_beta2_decay_rate") == config.end()) + ? 0.999 + : config["mf_beta2_decay_rate"]; + float mf_ada_epsilon = (config.find("mf_ada_epsilon") == config.end()) + ? 1e-8 + : config["mf_ada_epsilon"]; this->SetSparseSGD(nonclk_coeff, clk_coeff, min_bound, max_bound, learning_rate, initial_g2sum, - initial_range); + initial_range, + beta1_decay_rate, + beta2_decay_rate, + ada_epsilon); this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate, mf_initial_g2sum, mf_initial_range, mf_min_bound, - mf_max_bound); + mf_max_bound, + mf_beta1_decay_rate, + mf_beta2_decay_rate, + mf_ada_epsilon); + + // set optimizer type(naive,adagrad,std_adagrad,adam,share_adam) + optimizer_type_ = (config.find("optimizer_type") == config.end()) + ? 
1 + : static_cast(config["optimizer_type"]); } void SetDate(int year, int month, int day) { @@ -380,7 +461,7 @@ class PSGPUWrapper { if (slot_info_initialized_) { return; } - SlotRecordDataset* dataset = dynamic_cast(dataset_); + SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_); auto slots_vec = dataset->GetSlots(); slot_offset_vector_.clear(); for (auto& slot : slot_vector_) { @@ -421,10 +502,13 @@ class PSGPUWrapper { for (size_t i = 0; i < slot_index_vec_.size(); i++) { slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]]; } - val_type_size_ = - TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); - grad_type_size_ = - TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); + + auto accessor_wrapper_ptr = + GlobalAccessorTransfor::GetInstance().GetAccessorWrapper(); + val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_); + grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_); + VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_ + << " grad_type_size_:" << grad_type_size_; slot_info_initialized_ = true; } #endif @@ -445,6 +529,12 @@ class PSGPUWrapper { const std::string& conf); #endif +#ifdef PADDLE_WITH_PSCORE + void SetTableAccessor(paddle::distributed::ValueAccessor* accessor) { + cpu_table_accessor_ = accessor; + } +#endif + private: static std::shared_ptr s_instance_; Dataset* dataset_; @@ -497,6 +587,12 @@ class PSGPUWrapper { int day_; bool slot_info_initialized_ = false; int use_afs_api_ = 0; + int optimizer_type_ = 1; + std::string accessor_class_; + std::unordered_map fleet_config_; +#ifdef PADDLE_WITH_PSCORE + paddle::distributed::ValueAccessor* cpu_table_accessor_; +#endif #ifdef PADDLE_WITH_CUDA std::vector mem_pools_; @@ -521,6 +617,7 @@ class PSGPUWrapper { bool running_ = false; std::vector> pull_thread_pool_; std::vector> hbm_thread_pool_; + OptimizerConfig optimizer_config_; protected: static bool is_initialized_; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index ef6c70e624d4c..3505bff72e90a 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -28,9 +28,13 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -__global__ void PullCopy(float* dest, const FeatureValue* src, - const long long* len, int hidden, int slot_num, - int total_len, unsigned long long* keys) { +__global__ void PullCopy(float* dest, + const FeatureValue* src, + const long long* len, + int hidden, + int slot_num, + int total_len, + unsigned long long* keys) { int cid = core_id(); int ncores = core_num(); if (cid >= ncores) { @@ -42,8 +46,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, GM2LM(len, local_len, slot_num * sizeof(int64_t)); __global_ptr__ unsigned long long* local_keys[slot_num]; - GM2LM(keys, local_keys, - slot_num * sizeof(__global_ptr__ unsigned long long*)); + GM2LM( + keys, local_keys, slot_num * sizeof(__global_ptr__ unsigned long long*)); __global_ptr__ float* local_dest[slot_num]; GM2LM(dest, local_dest, slot_num * sizeof(__global_ptr__ float*)); @@ -64,10 +68,11 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, // copy read_len (length) of slots' val to LM for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(src + dest_len + k, local_slot_vals, + GM2LM(src + dest_len + k, + local_slot_vals, real_read_len * sizeof(FeatureValue)); - GM2LM(local_keys[i] + k, local_slot_keys, - real_read_len * sizeof(uint64_t)); + GM2LM( + local_keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t)); for (int j = 0; j < real_read_len; j++) { if (local_slot_keys[j] == 0) { local_dest_vals[j * hidden] = 0; @@ -89,7 +94,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, } } } - LM2GM(local_dest_vals, local_dest[i] + k * hidden, + LM2GM(local_dest_vals, + local_dest[i] + k * hidden, real_read_len * hidden * sizeof(float)); } } @@ -97,7 +103,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, __global__ void CopyKeysKernel(unsigned long long* src_keys, unsigned long long* dest_total_keys, - const long long* len, int slot_num, + const long long* len, + int slot_num, int total_len) { int cid = core_id(); int ncores = core_num(); @@ -110,7 +117,8 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys, GM2LM(len, local_len, slot_num * sizeof(long long)); __global_ptr__ unsigned long long* local_keys[slot_num]; - GM2LM(src_keys, local_keys, + GM2LM(src_keys, + local_keys, slot_num * sizeof(__global_ptr__ unsigned long long*)); for (int i = thread_id; i < slot_num; i += nthreads) { @@ -123,16 +131,23 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys, for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(local_keys[i] + k, local_slot_keys, + GM2LM(local_keys[i] + k, + local_slot_keys, real_read_len * sizeof(unsigned long long)); - LM2GM(local_slot_keys, dest_total_keys + dest_len + k, + LM2GM(local_slot_keys, + dest_total_keys + dest_len + k, real_read_len * sizeof(unsigned long long)); } } } -__global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, - int hidden, int slot_num, int total_len, int bs, +__global__ void PushCopy(FeaturePushValue* dest, + float* src, + long long* len, + int hidden, + int slot_num, + int total_len, + int bs, int* slot_vector) { int cid = core_id(); int ncores = core_num(); @@ -163,7 +178,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, // copy read_len(length) of slots' grad to LM for (int k = 0; k < slot_len; k += read_len) { int real_read_len = min(read_len, slot_len - k); - GM2LM(local_src[i] + k * hidden, 
local_slot_grads, + GM2LM(local_src[i] + k * hidden, + local_slot_grads, real_read_len * hidden * sizeof(float)); // copy from slots' grad to total grad for (int j = 0; j < real_read_len; j++) { @@ -176,7 +192,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, local_slot_grads[j * hidden + 3 + m] * -1. * bs; } } - LM2GM(local_dest_grads, dest + dest_len + k, + LM2GM(local_dest_grads, + dest + dest_len + k, real_read_len * sizeof(FeaturePushValue)); } } @@ -184,40 +201,11 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } -void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place, - uint64_t** gpu_keys, - const std::vector& values, - const FeatureValue* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, - const int64_t total_length) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - // float* buf_value = nullptr; - // xpu_malloc(reinterpret_cast(&buf_value), - // values.size() * sizeof(float*)); - // float** gpu_values = reinterpret_cast(&buf_value); - float* gpu_values = nullptr; - xpu_malloc(reinterpret_cast(&gpu_values), - values.size() * sizeof(float*)); - xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*), - XPU_HOST_TO_DEVICE); - - // unsigned long long** c_keys = (unsigned long long**)gpu_keys; - unsigned long long* c_keys = reinterpret_cast(gpu_keys); - const long long* c_len = (const long long*)gpu_len; - PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size, - slot_num, total_length, c_keys); - - xpu_wait(stream); -} - void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, - uint64_t** origin_keys, uint64_t* total_keys, - const int64_t* gpu_len, int slot_num, + uint64_t** origin_keys, + uint64_t* total_keys, + const int64_t* gpu_len, + int slot_num, int total_len) { XPUStream stream = nullptr; auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); @@ -232,66 +220,49 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, xpu_wait(stream); } -void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, - const std::vector& grad_values, - FeaturePushValue* total_grad_values_gpu, - const std::vector& slot_lengths, - const int hidden_size, - const int64_t total_length, - const int batch_size) { - XPUStream stream = nullptr; - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); - stream = static_cast(dev_ctx) - ->x_context() - ->xpu_stream; - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - - float* gpu_values = nullptr; - int64_t* gpu_len = nullptr; - int* d_slot_vector = nullptr; - - xpu_malloc(reinterpret_cast(&gpu_values), - grad_values.size() * sizeof(float*)); - xpu_malloc(reinterpret_cast(&gpu_len), - slot_lengths.size() * sizeof(int64_t)); - xpu_malloc(reinterpret_cast(&d_slot_vector), - slot_lengths_lod.size() * sizeof(int)); - - xpu_memcpy(gpu_values, grad_values.data(), - grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE); - xpu_memcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE); - xpu_memcpy(d_slot_vector, slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE); - - long long* c_len = (long long*)gpu_len; - PushCopy<<<2, 64, 
stream>>>(total_grad_values_gpu, gpu_values, c_len, - hidden_size, slot_lengths.size(), total_length, - batch_size, d_slot_vector); - xpu_wait(stream); -} - -void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, - float min_bound, float max_bound, - float learning_rate, float initial_g2sum, - float initial_range) { +void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, + float clk_coeff, + float min_bound, + float max_bound, + float learning_rate, + float initial_g2sum, + float initial_range, + float beta1_decay_rate, + float beta2_decay_rate, + float ada_epsilon) { OptimizerConfig optimizer_config; - optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, - learning_rate, initial_g2sum, initial_range); + optimizer_config.set_sparse_sgd(nonclk_coeff, + clk_coeff, + min_bound, + max_bound, + learning_rate, + initial_g2sum, + initial_range, + beta1_decay_rate, + beta2_decay_rate, + ada_epsilon); HeterPs_->set_sparse_sgd(optimizer_config); } void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, - float mf_learning_rate, float mf_initial_g2sum, - float mf_initial_range, float mf_min_bound, - float mf_max_bound) { + float mf_learning_rate, + float mf_initial_g2sum, + float mf_initial_range, + float mf_min_bound, + float mf_max_bound, + float mf_beta1_decay_rate, + float mf_beta2_decay_rate, + float mf_ada_epsilon) { OptimizerConfig optimizer_config; - optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, - mf_initial_g2sum, mf_initial_range, - mf_min_bound, mf_max_bound); + optimizer_config.set_embedx_sgd(mf_create_thresholds, + mf_learning_rate, + mf_initial_g2sum, + mf_initial_range, + mf_min_bound, + mf_max_bound, + mf_beta1_decay_rate, + mf_beta2_decay_rate, + mf_ada_epsilon); HeterPs_->set_embedx_sgd(optimizer_config); } diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 902854a7c7279..c58b539b6877d 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -594,6 +594,21 @@ def sparse_optimizer_config(sgd, strategy, prefix): bounds = strategy.get(prefix + 'sparse_weight_bounds', [-10, 10]) sgd.adam.weight_bounds.extend(bounds) + elif optimizer_name == "shared_adam": + sgd.name = 'SparseSharedAdamSGDRule' + sgd.adam.learning_rate = strategy.get( + prefix + 'sparse_learning_rate', 0.001) + sgd.adam.initial_range = strategy.get( + prefix + 'sparse_initial_range', 1e-4) + sgd.adam.beta1_decay_rate = strategy.get( + prefix + 'sparse_beta1_decay_rate', 0.9) + sgd.adam.beta2_decay_rate = strategy.get( + prefix + 'sparse_beta2_decay_rate', 0.999) + sgd.adam.ada_epsilon = strategy.get( + prefix + 'sparse_ada_epsilon', 1e-8) + bounds = strategy.get(prefix + 'sparse_weight_bounds', + [-10, 10]) + sgd.adam.weight_bounds.extend(bounds) def set_sparse_table_config(table_data, config): for key in config: diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index c6ba48e5e32b5..7d240983a1c28 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -195,7 +195,7 @@ def _set(self, accessor_proto, varname, program_id, context): sgd_param.naive.initial_range = 0.0001 if len(sgd_param.naive.weight_bounds) == 0: sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) - if sgd_param.name == "SparseAdamSGDRule": + if sgd_param.name == "SparseAdamSGDRule" or sgd_param.name == 
"SparseSharedAdamSGDRule": if not sgd_param.adam.HasField("learning_rate"): sgd_param.adam.learning_rate = 0.001 if not sgd_param.adam.HasField("initial_range"): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py new file mode 100644 index 0000000000000..c5ae2365b07cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps13.py @@ -0,0 +1,201 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os + +os.environ["WITH_DISTRIBUTE"] = "ON" + +import unittest +import tempfile +import shutil + +import paddle +import paddle.fluid as fluid +import paddle.distributed.fleet.base.role_maker as role_maker +import paddle.distributed.fleet as fleet + +paddle.enable_static() + +# For Net +base_lr = 0.2 +emb_lr = base_lr * 3 +dict_dim = 1500 +emb_dim = 128 +hid_dim = 128 +margin = 0.1 +sample_rate = 1 +batch_size = 4 + + +# this unittest is tested for SparseSharedAdamSGDRule +class TestPSPassWithBow(unittest.TestCase): + + def net(self): + + def get_acc(cos_q_nt, cos_q_pt, batch_size): + cond = fluid.layers.less_than(cos_q_nt, cos_q_pt) + cond = fluid.layers.cast(cond, dtype='float64') + cond_3 = fluid.layers.reduce_sum(cond) + acc = fluid.layers.elementwise_div(cond_3, + fluid.layers.fill_constant( + shape=[1], + value=batch_size * 1.0, + dtype='float64'), + name="simnet_acc") + return acc + + def get_loss(cos_q_pt, cos_q_nt): + loss_op1 = fluid.layers.elementwise_sub( + fluid.layers.fill_constant_batch_size_like(input=cos_q_pt, + shape=[-1, 1], + value=margin, + dtype='float32'), + cos_q_pt) + loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) + loss_op3 = fluid.layers.elementwise_max( + fluid.layers.fill_constant_batch_size_like(input=loss_op2, + shape=[-1, 1], + value=0.0, + dtype='float32'), + loss_op2) + avg_cost = fluid.layers.mean(loss_op3) + return avg_cost + + is_distributed = False + is_sparse = True + + # query + q = fluid.layers.data(name="query_ids", + shape=[1], + dtype="int64", + lod_level=1) + # embedding + q_emb = fluid.contrib.layers.sparse_embedding( + input=q, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr)) + q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim]) + # vsum + q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') + q_ss = fluid.layers.softsign(q_sum) + # fc layer after conv + q_fc = fluid.layers.fc( + input=q_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__q_fc__", + learning_rate=base_lr)) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + # pt + pt = fluid.layers.data(name="pos_title_ids", + shape=[1], + dtype="int64", + lod_level=1) + # embedding + pt_emb = fluid.contrib.layers.sparse_embedding( + input=pt, + size=[dict_dim, emb_dim], + 
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__",
+                learning_rate=emb_lr))
+        pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
+        # vsum
+        pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
+        pt_ss = fluid.layers.softsign(pt_sum)
+        # fc layer
+        pt_fc = fluid.layers.fc(
+            input=pt_ss,
+            size=hid_dim,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__fc__",
+                learning_rate=base_lr),
+            bias_attr=fluid.ParamAttr(name="__fc_b__"))
+        # nt
+        nt = fluid.layers.data(name="neg_title_ids",
+                               shape=[1],
+                               dtype="int64",
+                               lod_level=1)
+        # embedding
+        nt_emb = fluid.contrib.layers.sparse_embedding(
+            input=nt,
+            size=[dict_dim, emb_dim],
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__emb__",
+                learning_rate=emb_lr))
+        nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
+        # vsum
+        nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
+        nt_ss = fluid.layers.softsign(nt_sum)
+        # fc layer
+        nt_fc = fluid.layers.fc(
+            input=nt_ss,
+            size=hid_dim,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01),
+                name="__fc__",
+                learning_rate=base_lr),
+            bias_attr=fluid.ParamAttr(name="__fc_b__"))
+        cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
+        cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
+        # loss
+        avg_cost = get_loss(cos_q_pt, cos_q_nt)
+        # acc
+        acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
+        return [avg_cost, acc, cos_q_pt]
+
+    def test(self):
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ[
+            "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
+        os.environ["TRAINING_ROLE"] = "PSERVER"
+
+        role = role_maker.PaddleCloudRoleMaker()
+        fleet.init(role)
+        loss, acc, _ = self.net()
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.a_sync = True
+
+        configs = {}
+        configs['__emb__'] = {
+            "table_parameters.__emb__.accessor.embed_sgd_param.name":
+            "SparseSharedAdamSGDRule",
+            "table_parameters.__emb__.accessor.embedx_sgd_param.name":
+            "SparseSharedAdamSGDRule",
+        }
+        strategy.sparse_table_configs = configs
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(loss)
+
+        fleet.init_server()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
index 455a7a30cfd18..9ac88c802111f 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
@@ -334,6 +334,14 @@ def test_fleet_desc_configs(self):
             strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad.
             initial_range, 0.0001)
 
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        configs = {}
+        configs['emb'] = {"sparse_optimizer": "shared_adam"}
+        strategy.fleet_desc_configs = configs
+        self.assertEqual(
+            strategy.sparse_table_configs[0].accessor.embed_sgd_param.adam.
+            beta1_decay_rate, 0.9)
+
     def test_trainer_desc_configs(self):
         strategy = paddle.distributed.fleet.DistributedStrategy()
         configs = {
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index 559f2d95b915f..53ab93f57ce56 100755
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -671,7 +671,8 @@
     'test_trt_convert_reduce_sum',
     'save_quant2_model_lstm',
     'test_trt_convert_slice',
-    'test_quant2_int8_lstm_mkldnn'
+    'test_quant2_int8_lstm_mkldnn',
+    'test_dist_fleet_ps13'
 ]
 
 # mem=0 but always timeout or failed : It run 15 job each time in Single cases;
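
For reference, the snippet below is a minimal usage sketch and is not part of the patch: it assumes only the public DistributedStrategy API exercised by test_fleet_distributed_strategy.py above, and shows how a user would select the new shared Adam rule for a sparse table. The 'emb' key and the expected defaults (beta1=0.9, beta2=0.999) come from the defaults this patch adds to sparse_optimizer_config().

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()

# Select the shared Adam rule for the sparse embedding table named 'emb'.
strategy = fleet.DistributedStrategy()
strategy.fleet_desc_configs = {'emb': {'sparse_optimizer': 'shared_adam'}}

# The generated table config now points at SparseSharedAdamSGDRule and
# carries the Adam hyper-parameter defaults introduced by this patch.
accessor = strategy.sparse_table_configs[0].accessor
print(accessor.embed_sgd_param.name)                   # SparseSharedAdamSGDRule
print(accessor.embed_sgd_param.adam.beta1_decay_rate)  # 0.9
print(accessor.embed_sgd_param.adam.beta2_decay_rate)  # 0.999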