From 028d868c8d4f21f66ea691295a1e1d13092cefc5 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 2 Nov 2021 13:41:24 -0700 Subject: [PATCH 1/8] [deepspeed] zero inference --- src/transformers/deepspeed.py | 33 +++++++++++++++++++++++++++++-- src/transformers/trainer.py | 7 +------ tests/deepspeed/test_deepspeed.py | 28 ++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 8 deletions(-) diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index bb5d25d4b2375b..f8d6b579e592d0 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -111,6 +111,24 @@ def get_value(self, ds_key_long, default=None): return default return config.get(ds_key, default) + def del_config_sub_tree(self, ds_key_long, must_exist=False): + config = self.config + + # find the config node of interest if it exists + nodes = ds_key_long.split(".") + for node in nodes: + parent_config = config + config = config.get(node) + if config is None: + if must_exist: + raise ValueError(f"Can't find {ds_key_long} entry in the config") + else: + return + + # if found remove it + if parent_config is not None: + parent_config.pop(node) + def is_true(self, ds_key_long): """ Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. So use this method to @@ -280,7 +298,7 @@ def deepspeed_config(): return None -def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): +def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None, inference=False): """ Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. @@ -290,6 +308,7 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): trainer: Trainer object num_training_steps: per single gpu resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load + inference: launch in inference mode (no optimizer) Returns: model, optimizer, lr_scheduler @@ -357,7 +376,17 @@ def _lr_scheduler_callable(optimizer): # set the Deepspeed log level consistent with the trainer ds_logger.setLevel(args.get_process_log_level()) - model_parameters = filter(lambda p: p.requires_grad, model.parameters()) + if inference: + optimizer = None + lr_scheduler = None + hf_deepspeed_config.del_config_sub_tree("optimizer") + hf_deepspeed_config.del_config_sub_tree("lr_scheduler") + model_parameters = None + else: + model_parameters = filter(lambda p: p.requires_grad, model.parameters()) + + # XXX: validate + print(config) model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model, diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 4cda1dcb840e83..5b8e35369e3470 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2221,15 +2221,10 @@ def evaluation_loop( # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval # from the checkpoint eventually - deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None) + deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None, inference=True) self.model = deepspeed_engine.module self.model_wrapped = deepspeed_engine self.deepspeed = deepspeed_engine - # XXX: we don't need optim/sched for inference, but this needs to be sorted out, since - # for example the Z3-optimizer is a must for zero3 to work even for inference - what we - # don't need is the deepspeed basic optimizer which is self.optimizer.optimizer - 
deepspeed_engine.optimizer.optimizer = None - deepspeed_engine.lr_scheduler = None model = self._wrap_model(self.model, training=False) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index a7ba14b022ef83..f28ac5ce3bf49a 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -709,6 +709,34 @@ def test_fp32_distributed(self, stage): fp16=False, ) + @require_torch_multi_gpu + @parameterized.expand(stages) + def test_inference_fp16(self, stage): + # this is just inference, so no optimizer should be loaded + self.run_and_check( + stage=stage, + model_name=T5_TINY, + distributed=True, + do_train=False, + do_eval=True, + quality_checks=False, + fp16=True, + ) + + @require_torch_multi_gpu + @parameterized.expand(stages) + def test_inference_fp32(self, stage): + # this is just inference, so no optimizer should be loaded + self.run_and_check( + stage=stage, + model_name=T5_TINY, + distributed=True, + do_train=False, + do_eval=True, + quality_checks=False, + fp16=False, + ) + @parameterized.expand(stages) def test_resume_train_not_from_ds_checkpoint(self, stage): # do normal training and then resume not from the deepspeed checkpoint but explicitly from From 5c1011ff0f60b1f12d31882507c95a90d4921630 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 2 Nov 2021 13:57:13 -0700 Subject: [PATCH 2/8] only z3 makes sense for inference --- src/transformers/deepspeed.py | 7 ++++--- tests/deepspeed/test_deepspeed.py | 22 +++++----------------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index f8d6b579e592d0..4c667c6e1a625b 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -376,6 +376,10 @@ def _lr_scheduler_callable(optimizer): # set the Deepspeed log level consistent with the trainer ds_logger.setLevel(args.get_process_log_level()) + # only Z3 makes sense for the inference + if inference and not hf_deepspeed_config.is_zero3(): + raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config") + if inference: optimizer = None lr_scheduler = None @@ -385,9 +389,6 @@ def _lr_scheduler_callable(optimizer): else: model_parameters = filter(lambda p: p.requires_grad, model.parameters()) - # XXX: validate - print(config) - model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model, model_parameters=model_parameters, diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index f28ac5ce3bf49a..662d8cd31a2e5b 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -710,11 +710,13 @@ def test_fp32_distributed(self, stage): ) @require_torch_multi_gpu - @parameterized.expand(stages) - def test_inference_fp16(self, stage): + @parameterized.expand(["fp16", "fp32"]) + def test_inference(self, dtype): # this is just inference, so no optimizer should be loaded + # it only works for z3 (makes no sense with z1-z2) + fp16 = True if dtype == "fp16" else False self.run_and_check( - stage=stage, + stage=ZERO3, model_name=T5_TINY, distributed=True, do_train=False, @@ -723,20 +725,6 @@ def test_inference_fp16(self, stage): fp16=True, ) - @require_torch_multi_gpu - @parameterized.expand(stages) - def test_inference_fp32(self, stage): - # this is just inference, so no optimizer should be loaded - self.run_and_check( - stage=stage, - model_name=T5_TINY, - distributed=True, - do_train=False, - do_eval=True, - quality_checks=False, - fp16=False, - ) - 
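# A self-contained editorial sketch (not part of the patch) of the parameterized.expand
# pattern the new test_inference relies on: each string in the list becomes its own test
# case, and the test body maps the "fp16"/"fp32" string onto a boolean before forwarding
# it to run_and_check (note the hunk above still hard-codes fp16=True; a later patch in
# this series changes it to fp16=fp16). ParameterizedDtypeSketch is a hypothetical
# stand-in and does not launch DeepSpeed.
import unittest

from parameterized import parameterized


class ParameterizedDtypeSketch(unittest.TestCase):
    @parameterized.expand(["fp16", "fp32"])
    def test_dtype_to_flag(self, dtype):
        # one generated test per dtype string
        fp16 = dtype == "fp16"
        self.assertIsInstance(fp16, bool)


if __name__ == "__main__":
    unittest.main()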
@parameterized.expand(stages) def test_resume_train_not_from_ds_checkpoint(self, stage): # do normal training and then resume not from the deepspeed checkpoint but explicitly from From ad4395baa39eeeb6b731473604dd531de51b9f3c Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 2 Nov 2021 14:09:49 -0700 Subject: [PATCH 3/8] fix and style --- src/transformers/trainer.py | 4 +++- tests/deepspeed/test_deepspeed.py | 32 +++++++++++++++---------------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 5b8e35369e3470..3c3b07080e83ce 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -2221,7 +2221,9 @@ def evaluation_loop( # XXX: eval doesn't have `resume_from_checkpoint` arg but we should be able to do eval # from the checkpoint eventually - deepspeed_engine, _, _ = deepspeed_init(self, num_training_steps=0, resume_from_checkpoint=None, inference=True) + deepspeed_engine, _, _ = deepspeed_init( + self, num_training_steps=0, resume_from_checkpoint=None, inference=True + ) self.model = deepspeed_engine.module self.model_wrapped = deepspeed_engine self.deepspeed = deepspeed_engine diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 662d8cd31a2e5b..9e9d9c5996091a 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -709,22 +709,6 @@ def test_fp32_distributed(self, stage): fp16=False, ) - @require_torch_multi_gpu - @parameterized.expand(["fp16", "fp32"]) - def test_inference(self, dtype): - # this is just inference, so no optimizer should be loaded - # it only works for z3 (makes no sense with z1-z2) - fp16 = True if dtype == "fp16" else False - self.run_and_check( - stage=ZERO3, - model_name=T5_TINY, - distributed=True, - do_train=False, - do_eval=True, - quality_checks=False, - fp16=True, - ) - @parameterized.expand(stages) def test_resume_train_not_from_ds_checkpoint(self, stage): # do normal training and then resume not from the deepspeed checkpoint but explicitly from @@ -743,6 +727,22 @@ def test_resume_train_not_from_ds_checkpoint(self, stage): self.do_checks(output_dir, do_train=do_train, do_eval=do_eval) + @require_torch_multi_gpu + @parameterized.expand(["fp16", "fp32"]) + def test_inference(self, dtype): + # this is just inference, so no optimizer should be loaded + # it only works for z3 (makes no sense with z1-z2) + fp16 = True if dtype == "fp16" else False + self.run_and_check( + stage=ZERO3, + model_name=T5_TINY, + distributed=True, + do_train=False, + do_eval=True, + quality_checks=False, + fp16=fp16, + ) + def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True): if do_train: From 6d3be9bffe90cecd1e4233a40f6e9b7ec81357c5 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 19 Nov 2021 17:46:50 -0800 Subject: [PATCH 4/8] docs --- docs/source/main_classes/deepspeed.rst | 53 +++++++++++++++++++ setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst index db639bb53d5531..cebb2d8fb0e22f 100644 --- a/docs/source/main_classes/deepspeed.rst +++ b/docs/source/main_classes/deepspeed.rst @@ -46,6 +46,18 @@ won't be possible on a single GPU. parts of DeepSpeed like ``zero.Init`` for ZeRO stage 3 and higher. To tap into this feature read the docs on :ref:`deepspeed-non-trainer-integration`. +What is integrated: + +Training: + +1. 
full ZeRO protocol with ZeRO-Infinity + +Inference: + +1. DeepSpeed ZeRO Inference - same as Training but doesn't require Optimizer + +There is also DeepSpeed Inference - this is a totally different technology which uses Tensor Parallelism instead of +ZeRO (coming soon). @@ -1628,6 +1640,47 @@ larger multi-dimensional shape, this means that the parameter is partitioned and +.. _deepspeed-zero-inference: + + +Inference +======================================================================================================================= + +ZeRO Inference uses the same config as ZeRO-3 Training. You just don't need the optimizer and scheduler sections. In +fact you can leave these in the config file if you want to share the same one with the training. They will just be +ignored. + +Otherwise you just need to pass the usual :class:`~transformers.TrainingArguments` arguments. For example: + +.. code-block:: bash + + deepspeed --num_gpus=2 your_program.py --do_eval --deepspeed ds_config.json + +The only important thing is that you need to use ZeRO-3 configuration, since ZeRO-2 provides no benefit whatsoever for +the inference as only ZeRO-3 performs sharding of parameters, whereas ZeRO-1 shards gradients and optimizer states. + +Here is an example of running ``run_translation.py`` under DeepSpeed deploying all available GPUs: + +.. code-block:: bash + + deepspeed examples/pytorch/translation/run_translation.py \ + --deepspeed tests/deepspeed/ds_config_zero3.json \ + --model_name_or_path t5-small --output_dir output_dir \ + --do_eval --max_eval_samples 50 --warmup_steps 50 \ + --max_source_length 128 --val_max_target_length 128 \ + --overwrite_output_dir --per_device_eval_batch_size 4 \ + --predict_with_generate --dataset_config "ro-en" --fp16 \ + --source_lang en --target_lang ro --dataset_name wmt16 \ + --source_prefix "translate English to Romanian: " + +Since for inference there is no need for additional large memory used by the optimizer states and the gradients you +should be able to fit much larger batches and/or sequence length onto the same hardware. + + +Additionally DeepSpeed is currently developing a related product called Deepspeed-Inference which has no relationship +to the ZeRO technology, but instead uses tensor parallelism to scale models that can't fit onto a single GPU. This is a +work in progress and we will provide the integration once that product is complete. 
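If you are adapting this to your own script rather than ``run_translation.py``, a minimal sketch of the same
evaluation-only path through the :class:`~transformers.Trainer` could look as follows. Everything here is
illustrative: the model, the tiny in-memory dataset and the script name are placeholders, the config path points at
the ZeRO-3 config shipped with the tests, and the script is assumed to be started with the ``deepspeed`` launcher,
e.g. ``deepspeed --num_gpus=2 zero_inference_sketch.py``:

.. code-block:: python

    import os

    from torch.utils.data import Dataset

    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments


    class TinyEvalDataset(Dataset):
        # minimal stand-in for a real tokenized evaluation dataset
        def __init__(self, tokenizer, sources, targets):
            enc = tokenizer(sources, padding=True, truncation=True, return_tensors="pt")
            labels = tokenizer(targets, padding=True, truncation=True, return_tensors="pt").input_ids
            self.examples = [
                {"input_ids": enc.input_ids[i], "attention_mask": enc.attention_mask[i], "labels": labels[i]}
                for i in range(len(sources))
            ]

        def __len__(self):
            return len(self.examples)

        def __getitem__(self, i):
            return self.examples[i]


    model_name = "t5-small"  # placeholder - evaluate whatever model you like
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    args = TrainingArguments(
        output_dir="output_dir",
        do_train=False,
        do_eval=True,
        per_device_eval_batch_size=4,
        # any ZeRO-3 config; its optimizer/scheduler sections are simply ignored for inference
        deepspeed="tests/deepspeed/ds_config_zero3.json",
        # the deepspeed launcher exports LOCAL_RANK for every process it spawns
        local_rank=int(os.getenv("LOCAL_RANK", "-1")),
    )

    eval_dataset = TinyEvalDataset(
        tokenizer,
        sources=["translate English to Romanian: The house is small."],
        targets=["Casa este mică."],
    )

    trainer = Trainer(model=model, args=args, eval_dataset=eval_dataset)
    print(trainer.evaluate())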
+ Filing Issues ======================================================================================================================= diff --git a/setup.py b/setup.py index 2b9a19ba5745ac..97f4d0085d32e3 100644 --- a/setup.py +++ b/setup.py @@ -90,7 +90,7 @@ "cookiecutter==1.7.2", "dataclasses", "datasets", - "deepspeed>=0.5.3", + "deepspeed>=0.5.7", "docutils==0.16.0", "fairscale>0.3", "faiss-cpu", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 9168ad5c05c580..b23fc4e92701a8 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -8,7 +8,7 @@ "cookiecutter": "cookiecutter==1.7.2", "dataclasses": "dataclasses", "datasets": "datasets", - "deepspeed": "deepspeed>=0.5.3", + "deepspeed": "deepspeed>=0.5.7", "docutils": "docutils==0.16.0", "fairscale": "fairscale>0.3", "faiss-cpu": "faiss-cpu", From 026015292b50f9d7d80cc218367bc73ae7ab6123 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 19 Nov 2021 18:02:28 -0800 Subject: [PATCH 5/8] rework --- src/transformers/deepspeed.py | 74 +++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index 4c667c6e1a625b..487dd4ea5f445e 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -298,31 +298,7 @@ def deepspeed_config(): return None -def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None, inference=False): - """ - Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. - - If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made. - - Args: - trainer: Trainer object - num_training_steps: per single gpu - resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load - inference: launch in inference mode (no optimizer) - - Returns: model, optimizer, lr_scheduler - - """ - import deepspeed - from deepspeed.utils import logger as ds_logger - - model = trainer.model - args = trainer.args - - hf_deepspeed_config = args.hf_deepspeed_config - hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps) - - # resume config update - some bits like `model` and `num_training_steps` only become available during train +def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps): config = hf_deepspeed_config.config # Optimizer + Scheduler @@ -370,25 +346,55 @@ def _lr_scheduler_callable(optimizer): else: lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) - # keep for quick debug: - # from pprint import pprint; pprint(config) + return optimizer, lr_scheduler - # set the Deepspeed log level consistent with the trainer - ds_logger.setLevel(args.get_process_log_level()) - # only Z3 makes sense for the inference - if inference and not hf_deepspeed_config.is_zero3(): - raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config") +def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None, inference=False): + """ + Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. + + If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made. 
+ + Args: + trainer: Trainer object + num_training_steps: per single gpu + resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load + inference: launch in inference mode (no optimizer and no lr scheduler) + + Returns: model, optimizer, lr_scheduler + + """ + import deepspeed + from deepspeed.utils import logger as ds_logger + + model = trainer.model + args = trainer.args + + # resume config update - some bits like `model` and `num_training_steps` only become available during train + hf_deepspeed_config = args.hf_deepspeed_config + hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps) + config = hf_deepspeed_config.config + + # set the Deepspeed log level consistent with the Trainer + ds_logger.setLevel(args.get_process_log_level()) if inference: - optimizer = None - lr_scheduler = None + # only Z3 makes sense for the inference + if not hf_deepspeed_config.is_zero3(): + raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config") + + # in case the training config is re-used for inference hf_deepspeed_config.del_config_sub_tree("optimizer") hf_deepspeed_config.del_config_sub_tree("lr_scheduler") + optimizer, lr_scheduler = None, None model_parameters = None else: + optimizer, lr_scheduler = deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) + # keep for quick debug: + # from pprint import pprint; pprint(config) + model, optimizer, _, lr_scheduler = deepspeed.initialize( model=model, model_parameters=model_parameters, From 4bbd4c7084127ab3ddac05753868cfc57bde447e Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 19 Nov 2021 18:12:48 -0800 Subject: [PATCH 6/8] fix test --- tests/deepspeed/test_deepspeed.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 9e9d9c5996091a..aa1f331e2385d8 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -669,11 +669,10 @@ class TestDeepSpeedWithLauncher(TestCasePlus): def test_basic_distributed(self, stage): self.run_and_check(stage=stage, distributed=True) - @parameterized.expand(stages) - def test_do_eval_no_train(self, stage): - # we should not fail if train is skipped + def test_do_eval_no_train(self): + # testing only zero3 since zero2 makes no sense with inference self.run_and_check( - stage=stage, + stage=ZERO3, eval_steps=1, distributed=False, do_train=False, From 1cd19d3a8233b6025ca74a41cf428b3321c6b384 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 20 Nov 2021 20:36:21 -0800 Subject: [PATCH 7/8] Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- docs/source/main_classes/deepspeed.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst index cebb2d8fb0e22f..14e85bb2e3b3cd 100644 --- a/docs/source/main_classes/deepspeed.rst +++ b/docs/source/main_classes/deepspeed.rst @@ -1656,7 +1656,7 @@ Otherwise you just need to pass the usual :class:`~transformers.TrainingArgument deepspeed --num_gpus=2 your_program.py --do_eval --deepspeed ds_config.json -The only important thing is that you need to use ZeRO-3 configuration, since ZeRO-2 provides no benefit whatsoever for +The only important thing is that you need to use a ZeRO-3 configuration, since ZeRO-2 provides 
no benefit whatsoever for the inference as only ZeRO-3 performs sharding of parameters, whereas ZeRO-1 shards gradients and optimizer states. Here is an example of running ``run_translation.py`` under DeepSpeed deploying all available GPUs: From 0a9474862159f4c6ea2ca0ffc0fda2f335916b2b Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 20 Nov 2021 20:47:57 -0800 Subject: [PATCH 8/8] responding to suggestions --- docs/source/main_classes/deepspeed.rst | 12 +++++++----- src/transformers/deepspeed.py | 10 +++++++++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst index 14e85bb2e3b3cd..5b2e6e64e5c0c5 100644 --- a/docs/source/main_classes/deepspeed.rst +++ b/docs/source/main_classes/deepspeed.rst @@ -50,11 +50,13 @@ What is integrated: Training: -1. full ZeRO protocol with ZeRO-Infinity +1. DeepSpeed ZeRO training supports the full ZeRO stages 1, 2 and 3 with ZeRO-Infinity (CPU and NVME offload). Inference: -1. DeepSpeed ZeRO Inference - same as Training but doesn't require Optimizer +1. DeepSpeed ZeRO Inference supports ZeRO stage 3 with ZeRO-Infinity. It uses the same ZeRO protocol as training, but + it doesn't use an optimizer and a lr scheduler and only stage 3 is relevant. For more details see: + :ref:`deepspeed-zero-inference`. There is also DeepSpeed Inference - this is a totally different technology which uses Tensor Parallelism instead of ZeRO (coming soon). @@ -1643,7 +1645,7 @@ larger multi-dimensional shape, this means that the parameter is partitioned and .. _deepspeed-zero-inference: -Inference +ZeRO Inference ======================================================================================================================= ZeRO Inference uses the same config as ZeRO-3 Training. You just don't need the optimizer and scheduler sections. In @@ -1656,8 +1658,8 @@ Otherwise you just need to pass the usual :class:`~transformers.TrainingArgument deepspeed --num_gpus=2 your_program.py --do_eval --deepspeed ds_config.json -The only important thing is that you need to use a ZeRO-3 configuration, since ZeRO-2 provides no benefit whatsoever for -the inference as only ZeRO-3 performs sharding of parameters, whereas ZeRO-1 shards gradients and optimizer states. +The only important thing is that you need to use a ZeRO-3 configuration, since ZeRO-2 provides no benefit whatsoever +for the inference as only ZeRO-3 performs sharding of parameters, whereas ZeRO-1 shards gradients and optimizer states. Here is an example of running ``run_translation.py`` under DeepSpeed deploying all available GPUs: diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index 487dd4ea5f445e..edbcbd50cca200 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -112,6 +112,11 @@ def get_value(self, ds_key_long, default=None): return config.get(ds_key, default) def del_config_sub_tree(self, ds_key_long, must_exist=False): + """ + Deletes a sub-section of the config file if it's found. + + Unless ``must_exist`` is :obj:`True` the section doesn't have to exist. 
+ """ config = self.config # find the config node of interest if it exists @@ -121,7 +126,7 @@ def del_config_sub_tree(self, ds_key_long, must_exist=False): config = config.get(node) if config is None: if must_exist: - raise ValueError(f"Can't find {ds_key_long} entry in the config") + raise ValueError(f"Can't find {ds_key_long} entry in the config: {self.config}") else: return @@ -299,6 +304,9 @@ def deepspeed_config(): def deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps): + """ + A convenience wrapper that deals with optimizer and lr scheduler configuration. + """ config = hf_deepspeed_config.config # Optimizer + Scheduler