From 4628a1524381b8190e57c637fe931bd49737cf90 Mon Sep 17 00:00:00 2001
From: qingshui
Date: Mon, 20 Feb 2023 19:33:28 +0800
Subject: [PATCH] fix multi-machine save model (#217)

---
 paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu | 3 ++-
 python/paddle/distributed/ps/the_one_ps.py                 | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu
index 401e2f545ca262..865daaa5d8d4ab 100644
--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu
@@ -170,7 +170,8 @@ void GraphGpuWrapper::init_type_keys(
       auto place = platform::CUDAPlace(gpuid);
       platform::CUDADeviceGuard guard(gpuid);
       keys[f_idx][j] =
-          memory::AllocShared(place, tmp_keys[j].size() * sizeof(uint64_t));
+          memory::AllocShared(place, tmp_keys[j].size() * sizeof(uint64_t),
+                              phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
       cudaMemcpyAsync(keys[f_idx][j]->ptr(),
                       tmp_keys[j].data(),
                       sizeof(uint64_t) * tmp_keys[j].size(),
diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py
index cca1d1552fbb8a..592f12f4abf804 100755
--- a/python/paddle/distributed/ps/the_one_ps.py
+++ b/python/paddle/distributed/ps/the_one_ps.py
@@ -1729,7 +1729,7 @@ def _save_dense_params(self, *args, **kwargs):
 
     def _save_persistables(self, *args, **kwargs):
         fleet.util.barrier()
-        if self.role_maker._is_first_worker():
+        if self.context['use_ps_gpu'] or self.role_maker._is_first_worker():
             self._save_distributed_persistables(*args, **kwargs)
         fleet.util.barrier()
 
@@ -1747,7 +1747,7 @@ def _load_one_table(self, table_id, path, mode):
 
     def _load_persistables(self, path, mode):
         fleet.util.barrier()
-        if self.role_maker._is_first_worker():
+        if self.context['use_ps_gpu'] or self.role_maker._is_first_worker():
            self._worker.load_model(path, mode)
         fleet.util.barrier()
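
For context, below is a minimal standalone sketch of the gating pattern the the_one_ps.py change introduces: in GPU parameter-server mode each machine holds its own shard of the parameter tables, so every worker must take part in save/load, whereas in the CPU path only the first worker triggers it. The names context, role_maker, save_shard, and barrier are hypothetical stand-ins for illustration, not Paddle APIs.

    # Sketch only: shows why the condition is "use_ps_gpu or is_first_worker",
    # assuming each machine owns a shard of the tables under GPU-PS.
    def save_persistables(context, role_maker, save_shard, barrier):
        barrier()                  # all workers reach the save point together
        if context.get('use_ps_gpu') or role_maker.is_first_worker():
            # GPU-PS: every machine writes its own shard;
            # CPU-PS: only the first worker triggers the save.
            save_shard()
        barrier()                  # nobody proceeds until saving is complete

The barriers before and after the conditional mirror the fleet.util.barrier() calls in the patch; they keep the save/load window synchronized across machines regardless of which workers actually write or read model state.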