From 42d60d043c2ca101b0d1d730e89d3a0868b27318 Mon Sep 17 00:00:00 2001
From: Jackmin801 <56836461+Jackmin801@users.noreply.github.com>
Date: Sun, 1 Oct 2023 13:38:10 +0200
Subject: [PATCH 1/5] small docstring fix

---
 deepspeed/runtime/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py
index 7024b93d6820..8805be951370 100755
--- a/deepspeed/runtime/utils.py
+++ b/deepspeed/runtime/utils.py
@@ -491,12 +491,12 @@ def get_weight_norm(parameters, norm_type=2, mpu=None):
     Arguments:
         parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
             single Tensor that will have gradients normalized
-        max_norm (float or int): max norm of the gradients
         norm_type (float or int): type of the used p-norm. Can be
             ``'inf'`` for infinity norm.

     Returns:
         Total norm of the parameters (viewed as a single vector).
+        -1 if the norm value is NaN or Inf.
     """
     if isinstance(parameters, torch.Tensor):
         parameters = [parameters]

From cb126bae0820392f3afcd42220b0639a5fbdd4b3 Mon Sep 17 00:00:00 2001
From: Jackmin801 <56836461+Jackmin801@users.noreply.github.com>
Date: Sun, 1 Oct 2023 13:57:10 +0200
Subject: [PATCH 2/5] default is no longer none in zero_grad

---
 deepspeed/runtime/fp16/fused_optimizer.py   | 1 -
 deepspeed/runtime/fp16/unfused_optimizer.py | 1 -
 deepspeed/runtime/zero/stage3.py            | 1 -
 deepspeed/runtime/zero/stage_1_and_2.py     | 1 -
 4 files changed, 4 deletions(-)

diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py
index ad95e1f7c8ad..5839795112e4 100755
--- a/deepspeed/runtime/fp16/fused_optimizer.py
+++ b/deepspeed/runtime/fp16/fused_optimizer.py
@@ -137,7 +137,6 @@ def zero_grad(self, set_to_none=False):
         """
         Zero FP16 parameter grads.
         """
-        # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
                 if set_to_none:
diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py
index 9a61250d69f4..023f935ddcd3 100755
--- a/deepspeed/runtime/fp16/unfused_optimizer.py
+++ b/deepspeed/runtime/fp16/unfused_optimizer.py
@@ -115,7 +115,6 @@ def zero_grad(self, set_to_none=False):
         Zero FP16 parameter grads.
         """
         # FP32 grad should never exist outside of the step function
-        # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
                 if set_to_none:
diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index 9509b5a692ca..66804ef8d8d5 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -1576,7 +1576,6 @@ def zero_grad(self, set_to_none=False):
         self.micro_step_id = 0

         # FP32 grad should never exist.
-        # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
                 if set_to_none:
diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py
index d63a3463e43d..cd97e73ad21c 100755
--- a/deepspeed/runtime/zero/stage_1_and_2.py
+++ b/deepspeed/runtime/zero/stage_1_and_2.py
@@ -1524,7 +1524,6 @@ def zero_grad(self, set_to_none=False):
         Zero FP16 parameter grads.
         """
         # FP32 grad should never exist.
-        # For speed, set model fp16 grad to None by default
         # zero all pointers to grad tensors
         for group in self.bit16_groups:
             for p in group:

From 3b39bbd1482b21bed0eff5a43b9554d5a42cb682 Mon Sep 17 00:00:00 2001
From: Jackmin801 <56836461+Jackmin801@users.noreply.github.com>
Date: Sun, 1 Oct 2023 14:50:03 +0200
Subject: [PATCH 3/5] typo

---
 deepspeed/runtime/fp16/unfused_optimizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py
index 023f935ddcd3..26d9380d939b 100755
--- a/deepspeed/runtime/fp16/unfused_optimizer.py
+++ b/deepspeed/runtime/fp16/unfused_optimizer.py
@@ -215,7 +215,7 @@ def step(self, closure=None):
             norm_group_value = get_weight_norm(grads_for_norm, mpu=self.mpu)
             norm_groups.append(norm_group_value)

-            # copying gradients to fp32 to wor k with fp32 parameters
+            # copying gradients to fp32 to work with fp32 parameters
             for fp32_param, fp16_param in zip(self.fp32_groups[i], self.fp16_groups[i]):
                 if fp16_param.grad is None:
                     fp32_param.grad = torch.zeros(fp16_param.size(), dtype=fp32_param.dtype, device=fp32_param.device)

From d4030ced84e94cae32d0c402e5262196e9850c06 Mon Sep 17 00:00:00 2001
From: Jackmin801 <56836461+Jackmin801@users.noreply.github.com>
Date: Sun, 1 Oct 2023 19:52:06 +0200
Subject: [PATCH 4/5] max_norm not in method signature

---
 deepspeed/runtime/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py
index 8805be951370..b066090265c8 100755
--- a/deepspeed/runtime/utils.py
+++ b/deepspeed/runtime/utils.py
@@ -394,7 +394,6 @@ def get_grad_norm(parameters, norm_type=2, mpu=None):
     Arguments:
         parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
             single Tensor that will have gradients normalized
-        max_norm (float or int): max norm of the gradients
         norm_type (float or int): type of the used p-norm. Can be
             ``'inf'`` for infinity norm.

From 095c0820be008625dfb6acdfe6e1b6489d572600 Mon Sep 17 00:00:00 2001
From: Jackmin801 <56836461+Jackmin801@users.noreply.github.com>
Date: Mon, 2 Oct 2023 22:55:24 +0200
Subject: [PATCH 5/5] Revert "default is no longer none in zero_grad"

This reverts commit cb126bae0820392f3afcd42220b0639a5fbdd4b3.

---
 deepspeed/runtime/fp16/fused_optimizer.py   | 1 +
 deepspeed/runtime/fp16/unfused_optimizer.py | 1 +
 deepspeed/runtime/zero/stage3.py            | 1 +
 deepspeed/runtime/zero/stage_1_and_2.py     | 1 +
 4 files changed, 4 insertions(+)

diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py
index 5839795112e4..ad95e1f7c8ad 100755
--- a/deepspeed/runtime/fp16/fused_optimizer.py
+++ b/deepspeed/runtime/fp16/fused_optimizer.py
@@ -137,6 +137,7 @@ def zero_grad(self, set_to_none=False):
         """
         Zero FP16 parameter grads.
         """
+        # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
                 if set_to_none:
diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py
index 26d9380d939b..8e84d0650e47 100755
--- a/deepspeed/runtime/fp16/unfused_optimizer.py
+++ b/deepspeed/runtime/fp16/unfused_optimizer.py
@@ -115,6 +115,7 @@ def zero_grad(self, set_to_none=False):
         Zero FP16 parameter grads.
""" # FP32 grad should never exist outside of the step function + # For speed, set model fp16 grad to None by default for group in self.fp16_groups: for p in group: if set_to_none: diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 66804ef8d8d5..9509b5a692ca 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1576,6 +1576,7 @@ def zero_grad(self, set_to_none=False): self.micro_step_id = 0 # FP32 grad should never exist. + # For speed, set model fp16 grad to None by default for group in self.fp16_groups: for p in group: if set_to_none: diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index cd97e73ad21c..d63a3463e43d 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -1524,6 +1524,7 @@ def zero_grad(self, set_to_none=False): Zero FP16 parameter grads. """ # FP32 grad should never exist. + # For speed, set model fp16 grad to None by default # zero all pointers to grad tensors for group in self.bit16_groups: for p in group: