forked from microsoft/DeepSpeed
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Elastic training support (microsoft#602)
Co-authored-by: Samyam Rajbhandari <[email protected]>
- Loading branch information
Showing
16 changed files
with
883 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/usr/bin/env python | ||
|
||
import argparse | ||
import json | ||
|
||
import deepspeed | ||
from deepspeed.elasticity import compute_elastic_config | ||
|
||
|
||
def main():
    """Print the elasticity section of a DeepSpeed config and the values computed from it.

    Command-line arguments:
        -c / --config      Path to a DeepSpeed JSON config file (required).
        -w / --world-size  Intended/current world size. When greater than 0 the
                           micro batch size chosen for that world size is
                           reported as well.

    Exits through ``parser.error`` (usage message, status 2) when the config
    file has no 'elasticity' section.
    """
    parser = argparse.ArgumentParser()
    # Required: without a path there is nothing to load, and open(None)
    # would otherwise fail with an unhelpful TypeError.
    parser.add_argument('-c', '--config', type=str, required=True, help="DeepSpeed config json")
    parser.add_argument('-w', '--world-size', type=int, default=0, help="Intended/current world size")
    args = parser.parse_args()

    # Context manager so the file handle is closed deterministically.
    with open(args.config, 'r') as config_file:
        ds_config = json.load(config_file)

    if 'elasticity' not in ds_config:
        parser.error(f"no 'elasticity' section found in {args.config}")

    ds_version = deepspeed.__version__
    elastic_config = ds_config['elasticity']
    separator = '------------------------------------------'

    print(separator)
    print("Elasticity config:")
    print(separator)
    print(json.dumps(elastic_config, indent=4, sort_keys=True))

    if args.world_size > 0:
        # With a world size the call yields three values, the last being
        # the micro batch size for that world size.
        final_batch_size, valid_gpus, micro_batch_size = compute_elastic_config(
            ds_config=ds_config,
            target_deepspeed_version=ds_version,
            world_size=args.world_size)
        print(separator)
        print(f"Calculated results for world size {args.world_size}:")
        print(separator)
        print(f'final_batch_size .... {final_batch_size}')
        print(f'valid_gpus .......... {valid_gpus}')
        print(f'micro_batch_size .... {micro_batch_size}')
    else:
        # Without a world size only the batch size and the valid GPU
        # counts are returned.
        final_batch_size, valid_gpus = compute_elastic_config(
            ds_config=ds_config,
            target_deepspeed_version=ds_version)
        print(separator)
        print("Calculated results:")
        print(separator)
        print(f'final_batch_size .... {final_batch_size}')
        print(f'valid_gpus .......... {valid_gpus}')


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .elasticity import compute_elastic_config, elasticity_enabled, ensure_immutable_elastic_config |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
""" | ||
Copyright 2020 The Microsoft DeepSpeed Team | ||
""" | ||
|
||
import json | ||
from .constants import * | ||
|
||
|
||
class ElasticityError(Exception):
    """Root of the elasticity exception hierarchy.

    Catching this type handles every elasticity related error.
    """


class ElasticityConfigError(ElasticityError):
    """Raised when the elasticity configuration itself is at fault."""


class ElasticityIncompatibleWorldSize(ElasticityError):
    """Raised when the requested world size cannot be used with a given elastic config."""
|
||
|
||
class ElasticityConfig:
    """
    Elasticity settings built from a dictionary that holds only the
    elastic config parameters. Keys and fallback values come from the
    constants module.

    When "enabled" is true the dictionary must provide (at least)
    max_train_batch_size and micro_batch_sizes; every other entry is
    optional. Example:

    {
        "enabled": true,
        "max_train_batch_size": 2000,
        "micro_batch_sizes": [2,4,6],
        "min_gpus": 1,
        "max_gpus": 10000,
        "min_time": 20,
        "prefer_larger_batch": true,
        "ignore_non_elastic_batch_info": false,
        "version": 0.1
    }
    """
    def __init__(self, param_dict):
        """
        Read the elastic settings out of ``param_dict``.

        Raises ElasticityConfigError if elasticity is enabled and either
        of the two mandatory entries is absent.
        """
        self.enabled = param_dict.get(ENABLED, ENABLED_DEFAULT)

        if self.enabled:
            # Mandatory entries: no silent fallback once elasticity is on.
            if MAX_ACCEPTABLE_BATCH_SIZE not in param_dict:
                raise ElasticityConfigError(
                    f"Elasticity config missing {MAX_ACCEPTABLE_BATCH_SIZE}")
            self.max_acceptable_batch_size = param_dict[MAX_ACCEPTABLE_BATCH_SIZE]

            if MICRO_BATCHES not in param_dict:
                raise ElasticityConfigError(f"Elasticity config missing {MICRO_BATCHES}")
            self.micro_batches = param_dict[MICRO_BATCHES]
        else:
            # Elasticity is off, so defaults are acceptable here.
            self.max_acceptable_batch_size = param_dict.get(
                MAX_ACCEPTABLE_BATCH_SIZE,
                MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT)
            self.micro_batches = param_dict.get(MICRO_BATCHES, MICRO_BATCHES_DEFAULT)

        # Optional entries, each with its own default.
        self.min_gpus = param_dict.get(MIN_GPUS, MIN_GPUS_DEFAULT)
        self.max_gpus = param_dict.get(MAX_GPUS, MAX_GPUS_DEFAULT)
        self.min_time = param_dict.get(MIN_TIME, MIN_TIME_DEFAULT)
        self.version = param_dict.get(VERSION, VERSION_DEFAULT)
        self.prefer_larger_batch_size = param_dict.get(PREFER_LARGER_BATCH,
                                                       PREFER_LARGER_BATCH_DEFAULT)
        self.ignore_non_elastic_batch_info = param_dict.get(
            IGNORE_NON_ELASTIC_BATCH_INFO,
            IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT)

    def repr(self):
        """Return the instance attribute dictionary itself (not a copy)."""
        return vars(self)

    def __repr__(self):
        """Return the attributes as key-sorted JSON indented by four spaces."""
        return json.dumps(vars(self), indent=4, sort_keys=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
""" | ||
Copyright 2020 The Microsoft DeepSpeed Team | ||
""" | ||
|
||
#########################################
# Elasticity
#########################################
# The elasticity utility in DeepSpeed can be used to create highly elastic
# jobs compatible with a large number of GPUs. For elastic jobs, DeepSpeed
# will provide a batch size that can support a large number of GPUs based on
# the user specified parameters.

# Human-readable example of the expected config section. The body of the
# "elasticity" entry is kept as valid JSON so it can be copied verbatim.
FORMAT = '''
Elasticity should be enabled as:
"elasticity": {
  "enabled": true,
  "max_train_batch_size": 2000,
  "micro_batch_sizes": [2,4,6],
  "min_gpus": 1,
  "max_gpus" : 10000,
  "min_time": 20,
  "prefer_larger_batch": true,
  "ignore_non_elastic_batch_info": false,
  "version": 0.1
}
'''

ELASTICITY = 'elasticity'

# Current elasticity version
LATEST_ELASTICITY_VERSION = 0.1

ENABLED = 'enabled'
ENABLED_DEFAULT = False

# Max acceptable train_batch_size
MAX_ACCEPTABLE_BATCH_SIZE = 'max_train_batch_size'
MAX_ACCEPTABLE_BATCH_SIZE_DEFAULT = 2000

# Acceptable micro batch sizes, same as train_micro_batch_size_per_gpu
MICRO_BATCHES = 'micro_batch_sizes'
MICRO_BATCHES_DEFAULT = [2, 4, 6]

# Min/max of GPUs to search over
MIN_GPUS = 'min_gpus'
MIN_GPUS_DEFAULT = 1
MAX_GPUS = 'max_gpus'
MAX_GPUS_DEFAULT = 10000

# Minimum running time (minutes) before the scheduler will scale us.
# Kept as an integer so the default has the same type as the value shown
# in FORMAT and as the other numeric defaults.
MIN_TIME = "min_time"
MIN_TIME_DEFAULT = 20

# When finding a suitable batch size, attempt to find one that is closest
# to the max train batch size given.
PREFER_LARGER_BATCH = 'prefer_larger_batch'
PREFER_LARGER_BATCH_DEFAULT = True

# In order to reduce confusion, if elastic mode is enabled we
# require (via assert) that no batch info is set outside of the
# elastic config. You can turn off this assert via this config
# but keep in mind that all batch info defined outside the
# elastic mode *will be ignored*.
IGNORE_NON_ELASTIC_BATCH_INFO = 'ignore_non_elastic_batch_info'
IGNORE_NON_ELASTIC_BATCH_INFO_DEFAULT = False

# Version of elastic logic to use
VERSION = "version"
VERSION_DEFAULT = LATEST_ELASTICITY_VERSION

# Minimum deepspeed version to use elasticity
MINIMUM_DEEPSPEED_VERSION = "0.3.8"

# Environment variable storing elastic config from resource scheduler
DEEPSPEED_ELASTICITY_CONFIG = "DEEPSPEED_ELASTICITY_CONFIG"
Oops, something went wrong.