From ecc8b115afb4669d895403a2a05218901582c750 Mon Sep 17 00:00:00 2001
From: hamlet
Date: Mon, 12 Apr 2021 11:59:16 +0800
Subject: [PATCH] Add find_unused_parameters option

Unused parameters in a module are sometimes unintentional, so raise an
explicit error message when one is detected, and add an option to
suppress the error:
https://github.com/microsoft/DeepSpeed/issues/707
---
 deepspeed/runtime/zero/stage2.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index cd29625958c9..a91f99f775ea 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -119,7 +119,8 @@ def __init__(self,
                  allreduce_always_fp32=False,
                  postscale_gradients=True,
                  gradient_predivide_factor=1.0,
-                 gradient_accumulation_steps=1):
+                 gradient_accumulation_steps=1,
+                 find_unused_parameters=False):
 
         # Load pre-installed or JIT compile (un)flatten ops
         util_ops = UtilsBuilder().load()
@@ -173,6 +174,7 @@ def __init__(self,
         self.postscale_gradients = postscale_gradients
         self.gradient_accumulation_steps = gradient_accumulation_steps
         self.micro_step_id = 0
+        self.find_unused_parameters = find_unused_parameters
 
         if self.reduce_scatter:
             assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled"
@@ -889,6 +891,17 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params):
             if param_id in self.norm_for_param_grads:
                 param_norm = self.norm_for_param_grads[param_id]
                 total_norm += param_norm.item()**2
+            else:
+                # Unused parameters in a module are sometimes unintentional,
+                # so raise an explicit error here unless the user opts out.
+                # Error msg adapted from torch.nn.parallel.DistributedDataParallel
+                assert self.find_unused_parameters, """
+                    This error indicates that your module has parameters that were not used in producing loss.
+                    You can avoid this error by
+                    (1) passing the keyword argument `find_unused_parameters=True` to `deepspeed.runtime.engine.DeepSpeedEngine`; or
+                    (2) making sure all trainable parameters and `forward` function outputs participate in calculating loss.
+                    """
+
         # Sum across all model parallel GPUs.
         total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
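
Note: for context, a minimal sketch of the failure mode this patch guards
against, in plain PyTorch with no DeepSpeed involved (the model and parameter
names are illustrative, not from the patch). A parameter that takes no part in
producing the loss receives no gradient, which is exactly the case where a
param_id is missing from norm_for_param_grads above.

    import torch
    import torch.nn as nn

    class ModelWithUnusedParam(nn.Module):
        def __init__(self):
            super().__init__()
            self.used = nn.Linear(4, 4)
            self.unused = nn.Linear(4, 4)  # never called in forward()

        def forward(self, x):
            return self.used(x)

    model = ModelWithUnusedParam()
    loss = model(torch.randn(2, 4)).sum()
    loss.backward()

    print(model.used.weight.grad is None)    # False: a gradient was produced
    print(model.unused.weight.grad is None)  # True: no gradient, so the new assert would fire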
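
Note: the assert's control flow, restated as a hedged standalone sketch (the
function and variable names here are illustrative, not DeepSpeed's). With the
flag set, a parameter with no recorded gradient simply contributes nothing to
the squared norm; with it unset, the same situation raises.

    def total_grad_norm_sq(param_ids, norm_for_param_grads, find_unused_parameters):
        total = 0.0
        for pid in param_ids:
            if pid in norm_for_param_grads:
                total += norm_for_param_grads[pid] ** 2
            else:
                # No gradient was recorded for this parameter this step.
                assert find_unused_parameters, (
                    f"parameter {pid} was not used in producing loss; "
                    "pass find_unused_parameters=True to tolerate this")
        return total

    # With the flag set, the unused parameter (id 1) is silently skipped:
    print(total_grad_norm_sq([0, 1], {0: 2.0}, find_unused_parameters=True))  # 4.0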