Small docstring fix #4431

Merged 6 commits on Oct 3, 2023
Changes from 4 commits
1 change: 0 additions & 1 deletion deepspeed/runtime/fp16/fused_optimizer.py
@@ -137,7 +137,6 @@ def zero_grad(self, set_to_none=False):
         """
         Zero FP16 parameter grads.
         """
-        # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
                 if set_to_none:
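Not part of the diff, but for context on the comment removed above: a minimal, standalone PyTorch sketch of what the two branches of zero_grad(set_to_none=...) do to a parameter's .grad. The toy parameter and loss are assumed for illustration only.

import torch

# Toy parameter with a gradient attached (illustrative only).
p = torch.nn.Parameter(torch.ones(3))
(p * 2.0).sum().backward()
print(p.grad)        # tensor([2., 2., 2.])

# set_to_none=False: keep the gradient tensor and zero it in place,
# mirroring the detach_()/zero_() branch of the optimizers touched here.
p.grad.detach_()
p.grad.zero_()
print(p.grad)        # tensor([0., 0., 0.])

# set_to_none=True: drop the tensor entirely; memory is released and the
# next backward() allocates a fresh gradient instead of accumulating into zeros.
p.grad = None
print(p.grad)        # None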
3 changes: 1 addition & 2 deletions deepspeed/runtime/fp16/unfused_optimizer.py
@@ -115,7 +115,6 @@ def zero_grad(self, set_to_none=False):
         Zero FP16 parameter grads.
         """
         # FP32 grad should never exist outside of the step function
-        # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
                 if set_to_none:
@@ -216,7 +215,7 @@ def step(self, closure=None):
                 norm_group_value = get_weight_norm(grads_for_norm, mpu=self.mpu)
             norm_groups.append(norm_group_value)
 
-            # copying gradients to fp32 to wor k with fp32 parameters
+            # copying gradients to fp32 to work with fp32 parameters
             for fp32_param, fp16_param in zip(self.fp32_groups[i], self.fp16_groups[i]):
                 if fp16_param.grad is None:
                     fp32_param.grad = torch.zeros(fp16_param.size(), dtype=fp32_param.dtype, device=fp32_param.device)
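For readers skimming the second hunk: it sits inside the usual fp16-master-weights flow, where fp16 gradients are copied into fp32 master parameters before the fp32 optimizer step. Below is a minimal sketch of that pattern; all names are invented here, and the upcast branch is assumed since the hunk is truncated before the corresponding else.

import torch

# Hypothetical fp16 "model" parameters and their fp32 master copies.
fp16_params = [torch.nn.Parameter(torch.randn(4).half())]
fp32_params = [torch.nn.Parameter(p.detach().float()) for p in fp16_params]

# Pretend a backward pass left an fp16 gradient behind.
fp16_params[0].grad = torch.randn(4).half()

# Same shape as the loop in the hunk: a missing grad becomes zeros in fp32,
# an existing grad is upcast so the fp32 optimizer can consume it.
for fp32_p, fp16_p in zip(fp32_params, fp16_params):
    if fp16_p.grad is None:
        fp32_p.grad = torch.zeros(fp16_p.size(), dtype=fp32_p.dtype, device=fp32_p.device)
    else:
        fp32_p.grad = fp16_p.grad.to(fp32_p.dtype)

torch.optim.SGD(fp32_params, lr=0.1).step()   # update happens on the fp32 copies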
3 changes: 1 addition & 2 deletions deepspeed/runtime/utils.py
@@ -394,7 +394,6 @@ def get_grad_norm(parameters, norm_type=2, mpu=None):
     Arguments:
         parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
             single Tensor that will have gradients normalized
-        max_norm (float or int): max norm of the gradients
         norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
             infinity norm.
 
@@ -491,12 +490,12 @@ def get_weight_norm(parameters, norm_type=2, mpu=None):
     Arguments:
         parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
             single Tensor that will have gradients normalized
-        max_norm (float or int): max norm of the gradients
         norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
             infinity norm.
 
     Returns:
         Total norm of the parameters (viewed as a single vector).
+        -1 if the norm value is NaN or Inf.
     """
     if isinstance(parameters, torch.Tensor):
         parameters = [parameters]
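As a quick sanity check on the corrected docstrings, here is a small usage sketch of the two helpers with the arguments they actually take (parameters, norm_type, mpu) and no max_norm; the toy parameters are assumed, and the -1 case follows the docstring line added above.

import torch
from deepspeed.runtime.utils import get_grad_norm, get_weight_norm

# Toy parameters with gradients attached (illustrative only).
params = [torch.nn.Parameter(torch.randn(5)) for _ in range(3)]
for p in params:
    p.grad = torch.randn_like(p)

# Both helpers accept (parameters, norm_type=2, mpu=None); there is no
# max_norm argument, which is why that docstring line is removed here.
print(get_grad_norm(params, norm_type=2))
print(get_weight_norm(params, norm_type=2))

# Per the added docstring line, a NaN/Inf norm is reported as -1.
inf_param = torch.nn.Parameter(torch.tensor([float("inf")]))
print(get_weight_norm([inf_param]))   # expected: -1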
1 change: 0 additions & 1 deletion deepspeed/runtime/zero/stage3.py
@@ -1576,7 +1576,6 @@ def zero_grad(self, set_to_none=False):
         self.micro_step_id = 0
 
         # FP32 grad should never exist.
-        # For speed, set model fp16 grad to None by default
         for group in self.fp16_groups:
             for p in group:
                 if set_to_none:
1 change: 0 additions & 1 deletion deepspeed/runtime/zero/stage_1_and_2.py
@@ -1524,7 +1524,6 @@ def zero_grad(self, set_to_none=False):
         Zero FP16 parameter grads.
         """
         # FP32 grad should never exist.
-        # For speed, set model fp16 grad to None by default
         # zero all pointers to grad tensors
         for group in self.bit16_groups:
             for p in group: