Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add find_unused_parameters option to DeepSpeedEngine #945

Merged
merged 32 commits into from
Apr 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
976629d
Merge pull request #3 from microsoft/master
ghosthamlet Apr 12, 2021
3c15d98
Add find_unused_parameters option
ghosthamlet Apr 12, 2021
ecc8b11
Add find_unused_parameters option
ghosthamlet Apr 12, 2021
429c0bb
Fix syntax error
ghosthamlet Apr 12, 2021
e32db04
Fix yapf error
ghosthamlet Apr 12, 2021
f27165f
Fix yapf error
ghosthamlet Apr 12, 2021
bcfee31
Fix yapf error
ghosthamlet Apr 12, 2021
afc2da1
Fix yapf error
ghosthamlet Apr 12, 2021
257b5ca
Move stage2 find_unused_parameters to config file
ghosthamlet Apr 24, 2021
f72e2e7
Add stage2 find_unused_parameters
ghosthamlet Apr 24, 2021
26d6a28
Add stage2 find_unused_parameters
ghosthamlet Apr 24, 2021
9249fa7
Add stage2_find_unused_parameters option
ghosthamlet Apr 24, 2021
6605053
Change error msg to reflect zero_optimization config change
ghosthamlet Apr 24, 2021
a197882
Merge branch 'master' into find_unused_parameters
ghosthamlet Apr 24, 2021
10702d1
Fix yapf error
ghosthamlet Apr 24, 2021
8844dea
Fix yapf errors
ghosthamlet Apr 24, 2021
8abcaee
Change find_unused_parameters option name
ghosthamlet Apr 25, 2021
7cedcdf
Change find_unused_parameters option name
ghosthamlet Apr 25, 2021
86464f7
Change find_unused_parameters option name
ghosthamlet Apr 25, 2021
0837674
Change find_unused_parameters option name
ghosthamlet Apr 25, 2021
ec4bdee
Change find_unused_parameters option name
ghosthamlet Apr 25, 2021
8b07db1
Add UnusedParametersModel for test option find_unused_parameters
ghosthamlet Apr 25, 2021
785d910
Add unit test for stage2 find_unused_parameters
ghosthamlet Apr 25, 2021
1430bd5
Add cpu-adam compatible check
ghosthamlet Apr 25, 2021
43ebbbb
Remove dups import
ghosthamlet Apr 25, 2021
42bccaa
Trim spaces
ghosthamlet Apr 25, 2021
d3d2eb4
Fix yapf errors
ghosthamlet Apr 25, 2021
1579123
Trim spaces
ghosthamlet Apr 25, 2021
405aa2f
Add False Positive test check
ghosthamlet Apr 25, 2021
ad3724e
Fix find_unused_parameters test
ghosthamlet Apr 25, 2021
89c9b81
Trim spaces
ghosthamlet Apr 25, 2021
2bb8b66
Fix yapf error
ghosthamlet Apr 25, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion deepspeed/runtime/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,9 @@ def zero_param_persistence_threshold(self):
def zero_gather_fp16_weights_on_model_save(self):
    """Return the ZeRO config's ``gather_fp16_weights_on_model_save`` value."""
    zero_cfg = self._config.zero_config
    return zero_cfg.gather_fp16_weights_on_model_save

def zero_find_unused_parameters(self):
    """Return the ZeRO config's ``find_unused_parameters`` value."""
    zero_cfg = self._config.zero_config
    return zero_cfg.find_unused_parameters

def fp16_enabled(self):
    """Return the ``fp16_enabled`` value stored on the engine config."""
    engine_cfg = self._config
    return engine_cfg.fp16_enabled

Expand Down Expand Up @@ -789,7 +792,8 @@ def _configure_zero_optimizer(self, optimizer):
mpu=self.mpu,
postscale_gradients=self.postscale_gradients(),
gradient_predivide_factor=self.gradient_predivide_factor(),
gradient_accumulation_steps=self.gradient_accumulation_steps())
gradient_accumulation_steps=self.gradient_accumulation_steps(),
find_unused_parameters=self.zero_find_unused_parameters())
elif zero_stage == ZERO_OPTIMIZATION_WEIGHTS:
print("Initializing ZeRO Stage 3") if dist.get_rank() == 0 else None
from deepspeed.runtime.zero.stage3 import FP16_DeepSpeedZeroOptimizer_Stage3
Expand Down
7 changes: 7 additions & 0 deletions deepspeed/runtime/zero/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def __init__(self, param_dict):
self.max_reuse_distance = None
self.gather_fp16_weights_on_model_save = None

self.find_unused_parameters = None

if ZERO_OPTIMIZATION in param_dict.keys():
zero_config_dict = param_dict[ZERO_OPTIMIZATION]
if type(zero_config_dict) is bool:
Expand Down Expand Up @@ -175,3 +177,8 @@ def _initialize(self, zero_config_dict):
zero_config_dict,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT)

self.find_unused_parameters = get_scalar_param(
zero_config_dict,
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS,
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS_DEFAULT)
15 changes: 13 additions & 2 deletions deepspeed/runtime/zero/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
"cpu_offload_use_pin_memory": [true|false] (deprecated),
"sub_group_size" : 1000000000000,
"offload_param": {...},
"offload_optimizer": {...}
"offload_optimizer": {...},
"find_unused_parameters": [true|false]
}
}
'''
Expand Down Expand Up @@ -113,6 +114,14 @@
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE = 'stage3_gather_fp16_weights_on_model_save'
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False

# Currently only used by ZeRO stage 2, in
# complete_grad_norm_calculation_for_cpu_offload.
# Enable this option to avoid the error reported in:
# https://github.com/microsoft/DeepSpeed/issues/707
# torch.nn.parallel.DistributedDataParallel has an option of the same name
# with similar usage.
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS = 'find_unused_parameters'
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS_DEFAULT = False

ZERO_OPTIMIZATION_DEFAULT = {
ZERO_OPTIMIZATION_STAGE:
ZERO_OPTIMIZATION_STAGE_DEFAULT,
Expand Down Expand Up @@ -145,5 +154,7 @@
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD:
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE:
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT,
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS:
ZERO_OPTIMIZATION_FIND_UNUSED_PARAMETERS_DEFAULT
}
17 changes: 16 additions & 1 deletion deepspeed/runtime/zero/stage2.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ def __init__(self,
allreduce_always_fp32=False,
postscale_gradients=True,
gradient_predivide_factor=1.0,
gradient_accumulation_steps=1):
gradient_accumulation_steps=1,
find_unused_parameters=False):

if dist.get_rank() == 0:
logger.info(f"Reduce bucket size {reduce_bucket_size}")
Expand Down Expand Up @@ -149,6 +150,7 @@ def __init__(self,
self.postscale_gradients = postscale_gradients
self.gradient_accumulation_steps = gradient_accumulation_steps
self.micro_step_id = 0
self.find_unused_parameters = find_unused_parameters

if self.reduce_scatter:
assert not self.allreduce_always_fp32, "allreduce_always_fp32 is not yet supported with ZeRO-2 with reduce scatter enabled"
Expand Down Expand Up @@ -886,6 +888,19 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params):
if param_id in self.norm_for_param_grads:
param_norm = self.norm_for_param_grads[param_id]
total_norm += param_norm.item()**2
else:
# As unused parameters in modules may not be expected sometimes,
# add an explicit error msg when it occurred and an option to
# avoid the error
# Error msg adapted from torch.nn.parallel.DistributedDataParallel
assert self.find_unused_parameters, """
This error indicates that your module has parameters that
were not used in producing loss.
You can avoid this error by
(1) enable find_unused_parameters option in zero_optimization config;
(2) making sure all trainable parameters and `forward` function
outputs participate in calculating loss.
"""

# Sum across all model parallel GPUs.
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
Expand Down
8 changes: 7 additions & 1 deletion docs/_pages/config-json.md
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,8 @@ Enabling and configuring ZeRO memory optimizations
"stage3_param_persistence_threshold" : 1e6,
"sub_group_size" : 1e12,
"elastic_checkpoint" : [true|false],
"stage3_gather_fp16_weights_on_model_save": [true|false]
"stage3_gather_fp16_weights_on_model_save": [true|false],
"find_unused_parameters": [true|false]
}
```

Expand Down Expand Up @@ -396,6 +397,7 @@ Enabling and configuring ZeRO memory optimizations
| -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| Consolidate the weights before saving the model by `save_fp16_model()`. Since the weights are partitioned across GPUs, they aren't part of `state_dict`, so this function automatically gather the weights when this option is enabled and then saves the fp16 model weights. | `False` |


***cpu_offload***: [boolean]

**Deprecated:** **cpu_offload** is disabled and will be removed in future, please use `offload_optimizer` instead.
Expand Down Expand Up @@ -538,6 +540,10 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s
| ------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| Submit requests to storage device in an overlapped fashion without waiting for completion of earlier requests. | `true` |

***find_unused_parameters***: [boolean]

| Description | Default |
| -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| Parameters that do not take part in producing the loss are often unintended, so by default an explicit error is raised when one is encountered. Enable this option to allow unused parameters and avoid the error. `torch.nn.parallel.DistributedDataParallel` has a `find_unused_parameters` option with similar usage. Currently this option is only used in stage 2. | `False` |

### Logging

Expand Down
7 changes: 7 additions & 0 deletions tests/unit/simple_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ def forward(self, x, y):
return self.cross_entropy_loss(hidden_dim, y)


class UnusedParametersModel(SimpleModel):
    """SimpleModel variant that registers an additional ``unused_linear`` layer.

    The class defines no ``forward`` of its own; it only attaches the extra
    layer on top of whatever ``SimpleModel.__init__`` sets up.
    """
    def __init__(self, hidden_dim, empty_grad=False):
        super().__init__(hidden_dim, empty_grad)

        # Extra square linear layer (hidden_dim -> hidden_dim).
        extra_layer = torch.nn.Linear(hidden_dim, hidden_dim)
        self.unused_linear = extra_layer


class LinearStack(torch.nn.Module):
def __init__(self, input_dim=128, hidden_dim=128, output_dim=128, num_layers=4):
super().__init__()
Expand Down
70 changes: 70 additions & 0 deletions tests/unit/test_find_unused_parameters.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import torch
import pytest
import json
import argparse
import os
from common import distributed_test
from simple_model import UnusedParametersModel, random_dataloader, args_from_dict
from deepspeed.ops.op_builder import CPUAdamBuilder

import deepspeed


@pytest.mark.parametrize('find_unused_parameters', [False, True])
def test_stage2_find_unused_parameters(tmpdir, find_unused_parameters):
    """Train UnusedParametersModel under ZeRO stage 2 with cpu_offload.

    With ``find_unused_parameters`` disabled, the training loop is expected to
    fail with an AssertionError whose message mentions the option name; with
    it enabled, the loop must run to completion.
    """
    use_cpu_offload = True

    if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]:
        pytest.skip("cpu-adam is not compatible")

    # Assemble the DeepSpeed config section by section.
    zero_section = {
        "stage": 2,
        "cpu_offload": use_cpu_offload,
        "find_unused_parameters": find_unused_parameters
    }
    optimizer_section = {"type": "Adam", "params": {"lr": 1e-3}}
    fp16_section = {"enabled": True, "initial_scale_power": 8}
    config_dict = {
        "train_micro_batch_size_per_gpu": 2,
        "gradient_accumulation_steps": 2,
        "steps_per_print": 1,
        "zero_optimization": zero_section,
        "optimizer": optimizer_section,
        "fp16": fp16_section
    }

    hidden_dim = 4
    args = args_from_dict(tmpdir, config_dict)
    model = UnusedParametersModel(hidden_dim=hidden_dim)

    @distributed_test(world_size=[1])
    def _test_stage2_find_unused_parameters(args, model, hidden_dim):
        engine, _, _, _ = deepspeed.initialize(args=args,
                                               model=model,
                                               model_parameters=model.parameters())

        data_loader = random_dataloader(model=engine,
                                        total_samples=10,
                                        hidden_dim=hidden_dim,
                                        device=engine.device)

        def _run_training():
            # One forward/backward/step per batch produced by the loader.
            for batch in data_loader:
                loss = engine(batch[0], batch[1])
                engine.backward(loss)
                engine.step()

        if find_unused_parameters:
            # Option enabled: training must finish without raising.
            _run_training()
        else:
            # Option disabled: expect the assertion that names the option.
            with pytest.raises(AssertionError) as excinfo:
                _run_training()
            message_args = excinfo.value.args
            assert message_args and 'find_unused_parameters' in message_args[0]

    _test_stage2_find_unused_parameters(args=args, model=model, hidden_dim=hidden_dim)