Add find_unused_parameters option to DeepSpeedEngine #945

Merged 32 commits on Apr 25, 2021
File changes shown below are from 16 of the 32 commits.

Commits (32)
976629d  Merge pull request #3 from microsoft/master (ghosthamlet, Apr 12, 2021)
3c15d98  Add find_unused_parameters option (ghosthamlet, Apr 12, 2021)
ecc8b11  Add find_unused_parameters option (ghosthamlet, Apr 12, 2021)
429c0bb  Fix syntax error (ghosthamlet, Apr 12, 2021)
e32db04  Fix yapf error (ghosthamlet, Apr 12, 2021)
f27165f  Fix yapf error (ghosthamlet, Apr 12, 2021)
bcfee31  Fix yapf error (ghosthamlet, Apr 12, 2021)
afc2da1  Fix yapf error (ghosthamlet, Apr 12, 2021)
257b5ca  Move stage2 find_unused_parameters to config file (ghosthamlet, Apr 24, 2021)
f72e2e7  Add stage2 find_unused_parameters (ghosthamlet, Apr 24, 2021)
26d6a28  Add stage2 find_unused_parameters (ghosthamlet, Apr 24, 2021)
9249fa7  Add stage2_find_unused_parameters option (ghosthamlet, Apr 24, 2021)
6605053  Change error msg to reflect zero_optimization config change (ghosthamlet, Apr 24, 2021)
a197882  Merge branch 'master' into find_unused_parameters (ghosthamlet, Apr 24, 2021)
10702d1  Fix yapf error (ghosthamlet, Apr 24, 2021)
8844dea  Fix yapf errors (ghosthamlet, Apr 24, 2021)
8abcaee  Change find_unused_parameters option name (ghosthamlet, Apr 25, 2021)
7cedcdf  Change find_unused_parameters option name (ghosthamlet, Apr 25, 2021)
86464f7  Change find_unused_parameters option name (ghosthamlet, Apr 25, 2021)
0837674  Change find_unused_parameters option name (ghosthamlet, Apr 25, 2021)
ec4bdee  Change find_unused_parameters option name (ghosthamlet, Apr 25, 2021)
8b07db1  Add UnusedParametersModel for test option find_unused_parameters (ghosthamlet, Apr 25, 2021)
785d910  Add unit test for stage2 find_unused_parameters (ghosthamlet, Apr 25, 2021)
1430bd5  Add cpu-adam compatible check (ghosthamlet, Apr 25, 2021)
43ebbbb  Remove dups import (ghosthamlet, Apr 25, 2021)
42bccaa  Trim spaces (ghosthamlet, Apr 25, 2021)
d3d2eb4  Fix yapf errors (ghosthamlet, Apr 25, 2021)
1579123  Trim spaces (ghosthamlet, Apr 25, 2021)
405aa2f  Add False Positive test check (ghosthamlet, Apr 25, 2021)
ad3724e  Fix find_unused_parameters test (ghosthamlet, Apr 25, 2021)
89c9b81  Trim spaces (ghosthamlet, Apr 25, 2021)
2bb8b66  Fix yapf error (ghosthamlet, Apr 25, 2021)
6 changes: 5 additions & 1 deletion deepspeed/runtime/engine.py
@@ -352,6 +352,9 @@ def zero_cpu_offload(self):
def zero_sub_group_size(self):
return self._config.zero_config.sub_group_size

def zero_find_unused_parameters(self):
return self._config.zero_config.find_unused_parameters

def zero_optimization_stage(self):
return self._config.zero_optimization_stage

@@ -789,7 +792,8 @@ def _configure_zero_optimizer(self, optimizer):
mpu=self.mpu,
postscale_gradients=self.postscale_gradients(),
gradient_predivide_factor=self.gradient_predivide_factor(),
gradient_accumulation_steps=self.gradient_accumulation_steps())
gradient_accumulation_steps=self.gradient_accumulation_steps(),
find_unused_parameters=self.zero_find_unused_parameters())
elif zero_stage == ZERO_OPTIMIZATION_WEIGHTS:
print("Initializing ZeRO Stage 3") if dist.get_rank() == 0 else None
from deepspeed.runtime.zero.stage3 import FP16_DeepSpeedZeroOptimizer_Stage3
8 changes: 8 additions & 0 deletions deepspeed/runtime/zero/config.py
@@ -31,6 +31,9 @@ def __init__(self, param_dict):
self.offload_optimizer = None
self.sub_group_size = None

#Stage2 Specific Parameters
self.find_unused_parameters = None

#Stage3 Specific Parameters
self.prefetch_bucket_size = None
self.param_persistence_threshold = None
@@ -151,6 +154,11 @@ def _initialize(self, zero_config_dict):
ZERO_OPTIMIZATION_SUB_GROUP_SIZE,
ZERO_OPTIMIZATION_SUB_GROUP_SIZE_DEFAULT)

self.find_unused_parameters = get_scalar_param(
zero_config_dict,
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS,
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS_DEFAULT)

self.max_live_parameters = get_scalar_param(
zero_config_dict,
ZERO_OPTIMIZATION_MAX_LIVE_PARAMETERS,
15 changes: 13 additions & 2 deletions deepspeed/runtime/zero/constants.py
@@ -29,7 +29,8 @@
"cpu_offload_use_pin_memory": [true|false] (deprecated),
"sub_group_size" : 1000000000000,
"offload_param": {...},
"offload_optimizer": {...}
"offload_optimizer": {...},
"stage2_find_unused_parameters": [true|false]
}
}
'''
@@ -113,6 +114,14 @@
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE = 'stage3_gather_fp16_weights_on_model_save'
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False

# Used by stage 2's complete_grad_norm_calculation_for_cpu_offload.
# Enable this option to avoid:
# https://github.com/microsoft/DeepSpeed/issues/707
# torch.nn.parallel.DistributedDataParallel has an option with the same
# name and similar usage.
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS = 'stage2_find_unused_parameters'
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS_DEFAULT = False

ZERO_OPTIMIZATION_DEFAULT = {
ZERO_OPTIMIZATION_STAGE:
ZERO_OPTIMIZATION_STAGE_DEFAULT,
@@ -145,5 +154,7 @@
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD:
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE:
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT,
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS:
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS_DEFAULT
}
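The comment above compares this flag to the one in `torch.nn.parallel.DistributedDataParallel`. For reference, here is a minimal sketch of the DDP counterpart (standard PyTorch API, not part of this PR); the single-process gloo setup and placeholder address/port values are assumptions of mine.

```python
import os
import torch
import torch.distributed as dist

# For comparison only: plain PyTorch DDP exposes an analogous flag with the
# same name. Minimal single-process CPU sketch using the gloo backend.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(backend="gloo", rank=0, world_size=1)

model = torch.nn.Linear(8, 8)
ddp_model = torch.nn.parallel.DistributedDataParallel(
    model,
    find_unused_parameters=True,  # walk the autograd graph after each forward
)                                 # pass to detect parameters that got no grad

dist.destroy_process_group()
```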
17 changes: 16 additions & 1 deletion deepspeed/runtime/zero/stage2.py
@@ -95,7 +95,8 @@ def __init__(self,
allreduce_always_fp32=False,
postscale_gradients=True,
gradient_predivide_factor=1.0,
gradient_accumulation_steps=1):
gradient_accumulation_steps=1,
find_unused_parameters=False):

if dist.get_rank() == 0:
logger.info(f"Reduce bucket size {reduce_bucket_size}")
@@ -149,6 +150,7 @@ def __init__(self,
self.postscale_gradients = postscale_gradients
self.gradient_accumulation_steps = gradient_accumulation_steps
self.micro_step_id = 0
self.find_unused_parameters = find_unused_parameters

if self.reduce_scatter:
assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled"
@@ -886,6 +888,19 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params):
if param_id in self.norm_for_param_grads:
param_norm = self.norm_for_param_grads[param_id]
total_norm += param_norm.item()**2
else:
# Unused parameters in a module are often unintentional, so raise an
# explicit error when one is found, together with an option to
# suppress it.
# Error msg adapted from torch.nn.parallel.DistributedDataParallel
assert self.find_unused_parameters, """
This error indicates that your module has parameters that
were not used in producing loss.
You can avoid this error by
(1) enabling the stage2_find_unused_parameters option in the zero_optimization config;
(2) making sure all trainable parameters and `forward` function
outputs participate in calculating loss.
"""

# Sum across all model parallel GPUs.
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
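To make the new assertion concrete, here is a minimal sketch of the situation it guards against. The class name and sizes are illustrative, not taken from the PR's own `UnusedParametersModel` test; only the behavior being described comes from this change.

```python
import torch

# A module with a parameter that never participates in the forward pass.
# Under ZeRO stage 2 with optimizer offload to CPU, no gradient norm is ever
# recorded for `self.unused`, so complete_grad_norm_calculation_for_cpu_offload
# reaches the `else` branch above and raises unless
# stage2_find_unused_parameters is enabled.
class ToyUnusedParamModel(torch.nn.Module):
    def __init__(self, hidden_dim: int = 8):
        super().__init__()
        self.used = torch.nn.Linear(hidden_dim, hidden_dim)
        self.unused = torch.nn.Linear(hidden_dim, hidden_dim)  # never called in forward

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.used(x).sum()
```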
8 changes: 7 additions & 1 deletion docs/_pages/config-json.md
Expand Up @@ -299,7 +299,8 @@ Enabling and configuring ZeRO memory optimizations
"stage3_param_persistence_threshold" : 1e6,
"sub_group_size" : 1e12,
"elastic_checkpoint" : [true|false],
"stage3_gather_fp16_weights_on_model_save": [true|false]
"stage3_gather_fp16_weights_on_model_save": [true|false],
"stage2_find_unused_parameters": [true|false]
}
```

@@ -396,6 +397,7 @@ Enabling and configuring ZeRO memory optimizations
| -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| Consolidate the weights before saving the model by `save_fp16_model()`. Since the weights are partitioned across GPUs, they aren't part of `state_dict`, so this function automatically gathers the weights when this option is enabled and then saves the fp16 model weights. | `False` |


***cpu_offload***: [boolean]

**Deprecated:** **cpu_offload** is disabled and will be removed in future, please use `offload_optimizer` instead.
@@ -538,6 +540,10 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s
| ------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| Submit requests to storage device in an overlapped fashion without waiting for completion of earlier requests. | `true` |

***stage2_find_unused_parameters***: [boolean]
| Description | Default |
| -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| Unused parameters in a module can be unintentional, so ZeRO stage 2 raises an explicit error when it encounters one; enable this option to suppress the error and tolerate unused parameters. `torch.nn.parallel.DistributedDataParallel` has a `find_unused_parameters` option with similar usage. | `False` |
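For completeness, here is a minimal sketch of a config that enables the option, written as the Python-dict form of the JSON shown earlier; the batch size, fp16, and offload settings are illustrative placeholders rather than values taken from this PR.

```python
# Illustrative ZeRO stage-2 DeepSpeed config enabling the new flag.
ds_config = {
    "train_batch_size": 8,
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},  # the offload path that needed this flag
        "stage2_find_unused_parameters": True,   # tolerate parameters that never get a grad
    },
}
# This dict (or its JSON-file equivalent) is what would be handed to DeepSpeed
# at initialization time.
```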

### Logging
