From b95891beb2665b72410c3036cd4dbd02ea1ff98f Mon Sep 17 00:00:00 2001
From: sdtblck <46172032+sdtblck@users.noreply.github.com>
Date: Sat, 23 Jan 2021 22:00:30 +0100
Subject: [PATCH 1/2] Fix all Pipeline Module Parameters being sent to cuda:0

It seems that internally, PipelineModule calls self.to('cuda'). This
appears to send the model parameters to GPU 0 for *every visible GPU*
running the deepspeed script, which makes scaling up difficult. Instead,
we can send the parameters to the current device with
`self.to(f'cuda:{self.global_rank}')`.

I'm actually unsure why the entire model's parameters are getting sent to
one single GPU at this stage at all - aren't they meant to be partitioned
equally across all GPUs? Would appreciate anyone clearing this up for me.
---
 deepspeed/runtime/pipe/module.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py
index 6d24ed469f3a..8035a8b97e78 100644
--- a/deepspeed/runtime/pipe/module.py
+++ b/deepspeed/runtime/pipe/module.py
@@ -186,7 +186,7 @@ def forward(self, inputs):
 
         #with torch.random.fork_rng(devices=[torch.cuda.current_device()]):
         self._build()
-        self.to('cuda')
+        self.to(f'cuda:{self.global_rank}')
 
         self.tied_comms = self._index_tied_modules()
         self._synchronize_tied_weights()


From b9c6f70c230513e4d8bc80d16a986b6036a1e5ed Mon Sep 17 00:00:00 2001
From: sdtblck <46172032+sdtblck@users.noreply.github.com>
Date: Fri, 29 Jan 2021 16:20:03 +0100
Subject: [PATCH 2/2] change global_rank to local_rank

---
 deepspeed/runtime/pipe/module.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py
index 8035a8b97e78..5204d4b09de4 100644
--- a/deepspeed/runtime/pipe/module.py
+++ b/deepspeed/runtime/pipe/module.py
@@ -148,6 +148,8 @@ def forward(self, inputs):
         self.world_group = dist.new_group(ranks=range(dist.get_world_size()))
         self.global_rank = dist.get_rank(group=self.world_group)
         self.world_size = dist.get_world_size(group=self.world_group)
+        self.local_rank = int(os.environ.get("LOCAL_RANK", None))
+        assert self.local_rank != None
 
         if topology:
             self._topo = topology
@@ -186,7 +188,7 @@ def forward(self, inputs):
 
         #with torch.random.fork_rng(devices=[torch.cuda.current_device()]):
         self._build()
-        self.to(f'cuda:{self.global_rank}')
+        self.to(f'cuda:{self.local_rank}')
 
         self.tied_comms = self._index_tied_modules()
         self._synchronize_tied_weights()
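
For context (not part of the patch): the switch from `global_rank` to `local_rank` in the second commit matters on multi-node runs, where the global rank can exceed the number of GPUs per node and is therefore not a valid CUDA device index. Below is a minimal, hedged sketch of the per-process device placement the patch is after; `pin_module_to_local_rank` is a hypothetical helper, not DeepSpeed API, and it guards against a missing `LOCAL_RANK`, since `int(os.environ.get("LOCAL_RANK", None))` would raise a `TypeError` before any assert runs.

```python
import os

import torch
import torch.distributed as dist


def pin_module_to_local_rank(module: torch.nn.Module) -> torch.nn.Module:
    """Move a module to the GPU that belongs to this process.

    Sketch of the idea behind the patch: each launcher process reads
    LOCAL_RANK (set by the deepspeed / torch distributed launchers) and
    uses it as its CUDA device index instead of defaulting to cuda:0.
    """
    local_rank = os.environ.get("LOCAL_RANK")
    # Fail loudly if the launcher did not set LOCAL_RANK.
    assert local_rank is not None, "LOCAL_RANK not set by the launcher"
    local_rank = int(local_rank)

    torch.cuda.set_device(local_rank)
    return module.to(f'cuda:{local_rank}')


if __name__ == "__main__":
    # Hypothetical usage: one process per GPU, launched with `deepspeed`
    # or `torchrun`, so each process lands on its own device.
    dist.init_process_group(backend="nccl")
    model = torch.nn.Linear(8, 8)  # stand-in for a pipeline stage
    model = pin_module_to_local_rank(model)
    print(f"rank {dist.get_rank()} -> {next(model.parameters()).device}")
```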