From b95891beb2665b72410c3036cd4dbd02ea1ff98f Mon Sep 17 00:00:00 2001
From: sdtblck <46172032+sdtblck@users.noreply.github.com>
Date: Sat, 23 Jan 2021 22:00:30 +0100
Subject: [PATCH 1/2] Fix all Pipeline Module Parameters being sent to cuda:0

It seems that internally, PipelineModule calls self.to('cuda'). This
appears to send the model parameters to GPU 0 for *every visible GPU*
running the deepspeed script, which makes scaling up difficult. Instead,
we can send the parameters to the current device with
`self.to(f'cuda:{self.global_rank}')`.

I'm actually unsure why the entire model's parameters are getting sent to
one single GPU at this stage at all - aren't they meant to be partitioned
equally across all GPUs? Would appreciate anyone clearing this up for me.
---
 deepspeed/runtime/pipe/module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py
index 6d24ed469f3a..8035a8b97e78 100644
--- a/deepspeed/runtime/pipe/module.py
+++ b/deepspeed/runtime/pipe/module.py
@@ -186,7 +186,7 @@ def forward(self, inputs):
 
         #with torch.random.fork_rng(devices=[torch.cuda.current_device()]):
         self._build()
-        self.to('cuda')
+        self.to(f'cuda:{self.global_rank}')
 
         self.tied_comms = self._index_tied_modules()
         self._synchronize_tied_weights()


From b9c6f70c230513e4d8bc80d16a986b6036a1e5ed Mon Sep 17 00:00:00 2001
From: sdtblck <46172032+sdtblck@users.noreply.github.com>
Date: Fri, 29 Jan 2021 16:20:03 +0100
Subject: [PATCH 2/2] change global_rank to local_rank

---
 deepspeed/runtime/pipe/module.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py
index 8035a8b97e78..5204d4b09de4 100644
--- a/deepspeed/runtime/pipe/module.py
+++ b/deepspeed/runtime/pipe/module.py
@@ -148,6 +148,8 @@ def forward(self, inputs):
         self.world_group = dist.new_group(ranks=range(dist.get_world_size()))
         self.global_rank = dist.get_rank(group=self.world_group)
         self.world_size = dist.get_world_size(group=self.world_group)
+        self.local_rank = int(os.environ.get("LOCAL_RANK", None))
+        assert self.local_rank != None
 
         if topology:
             self._topo = topology
@@ -186,7 +188,7 @@ def forward(self, inputs):
 
         #with torch.random.fork_rng(devices=[torch.cuda.current_device()]):
         self._build()
-        self.to(f'cuda:{self.global_rank}')
+        self.to(f'cuda:{self.local_rank}')
 
         self.tied_comms = self._index_tied_modules()
         self._synchronize_tied_weights()
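
For context (not part of the patch): the switch from `global_rank` to `local_rank` in the second commit matters on multi-node runs, where the global rank can exceed the number of GPUs per node and is therefore not a valid CUDA device index. Below is a minimal, hedged sketch of the per-process device placement the patch is after; `pin_module_to_local_rank` is a hypothetical helper, not DeepSpeed API, and it guards against a missing `LOCAL_RANK`, since `int(os.environ.get("LOCAL_RANK", None))` would raise a `TypeError` before any assert runs.

```python
import os

import torch
import torch.distributed as dist


def pin_module_to_local_rank(module: torch.nn.Module) -> torch.nn.Module:
    """Move a module to the GPU that belongs to this process.

    Sketch of the idea behind the patch: each launcher process reads
    LOCAL_RANK (set by the deepspeed / torch distributed launchers) and
    uses it as its CUDA device index instead of defaulting to cuda:0.
    """
    local_rank = os.environ.get("LOCAL_RANK")
    # Fail loudly if the launcher did not set LOCAL_RANK.
    assert local_rank is not None, "LOCAL_RANK not set by the launcher"
    local_rank = int(local_rank)

    torch.cuda.set_device(local_rank)
    return module.to(f'cuda:{local_rank}')


if __name__ == "__main__":
    # Hypothetical usage: one process per GPU, launched with `deepspeed`
    # or `torchrun`, so each process lands on its own device.
    dist.init_process_group(backend="nccl")
    model = torch.nn.Linear(8, 8)  # stand-in for a pipeline stage
    model = pin_module_to_local_rank(model)
    print(f"rank {dist.get_rank()} -> {next(model.parameters()).device}")
```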