From b783dfb3ea5720de698585c643a00eb133c87ce9 Mon Sep 17 00:00:00 2001
From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Date: Fri, 21 Jul 2023 14:14:23 +0530
Subject: [PATCH 1/4] fix fsdp prepare to remove the warnings and fix excess
 memory usage

---
 src/transformers/trainer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index da4e623205a7e1..fd090eb028e22c 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1600,6 +1600,7 @@ def _inner_training_loop(
             and self.sharded_ddp != ShardedDDPOption.SIMPLE
             or is_sagemaker_mp_enabled()
             or self.fsdp is not None
+            or self.is_fsdp_enabled
         )

         # We need to reset the scheduler, as its parameters may be different on subsequent calls
@@ -1631,6 +1632,8 @@ def _inner_training_loop(
         use_accelerator_prepare = True if model is self.model else False

         if delay_optimizer_creation:
+            if use_accelerator_prepare:
+                self.model = self.accelerator.prepare(self.model)
             self.create_optimizer_and_scheduler(num_training_steps=max_steps)

         # prepare using `accelerator` prepare

From 241f184cb963d9d4cc4a750c20d7ef2fca1d09df Mon Sep 17 00:00:00 2001
From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Date: Fri, 21 Jul 2023 15:02:30 +0530
Subject: [PATCH 2/4] Update training_args.py

---
 src/transformers/training_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 7be24cc315c88d..da786317e79e6d 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -1567,6 +1567,7 @@ def __post_init__(self):
                 elif fsdp_option == FSDPOption.OFFLOAD:
                     os.environ["FSDP_OFFLOAD_PARAMS"] = "true"
                 elif fsdp_option == FSDPOption.AUTO_WRAP:
+                    os.environ["FSDP_AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[0]
                     if self.fsdp_config["fsdp_min_num_params"] > 0:
                         os.environ["FSDP_MIN_NUM_PARAMS"] = str(self.fsdp_config["fsdp_min_num_params"])
                         os.environ["FSDP_AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[1]
@@ -1574,7 +1575,6 @@ def __post_init__(self):
                         os.environ["FSDP_TRANSFORMER_CLS_TO_WRAP"] = ",".join(
                             self.fsdp_config["fsdp_transformer_layer_cls_to_wrap"]
                         )
-                        os.environ["FSDP_AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[0]

             prefetch_policy = self.fsdp_config.get("fsdp_backward_prefetch", "NO_PREFETCH")
             os.environ["FSDP_BACKWARD_PREFETCH"] = prefetch_policy.upper()
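For context: patch 1/4 lets accelerate-backed FSDP (`is_fsdp_enabled`) delay optimizer creation and runs `accelerator.prepare(self.model)` before the optimizer is built, so optimizer state is allocated for the sharded parameters rather than the full model. Patch 2/4 makes the transformer-based auto wrap policy the default whenever `auto_wrap` is requested, with the size-based policy overriding it when `fsdp_min_num_params > 0`; previously `FSDP_AUTO_WRAP_POLICY` was only exported when `fsdp_transformer_layer_cls_to_wrap` was given. The sketch below (not part of the patches) condenses that env-var ordering; the `FSDP_AUTO_WRAP_POLICY` values are assumed to mirror `accelerate.utils.constants.FSDP_AUTO_WRAP_POLICY`, and `set_auto_wrap_env` is a hypothetical helper.

```python
import os

# Assumed to mirror accelerate.utils.constants.FSDP_AUTO_WRAP_POLICY at the time of this series.
FSDP_AUTO_WRAP_POLICY = ["TRANSFORMER_BASED_WRAP", "SIZE_BASED_WRAP", "NO_WRAP"]


def set_auto_wrap_env(fsdp_config: dict) -> None:
    """Post-patch ordering: default to transformer-based wrapping, then let the
    size-based policy take over when `fsdp_min_num_params` is set."""
    os.environ["FSDP_AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[0]  # TRANSFORMER_BASED_WRAP
    if fsdp_config.get("fsdp_min_num_params", 0) > 0:
        os.environ["FSDP_MIN_NUM_PARAMS"] = str(fsdp_config["fsdp_min_num_params"])
        os.environ["FSDP_AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[1]  # SIZE_BASED_WRAP
    elif fsdp_config.get("fsdp_transformer_layer_cls_to_wrap") is not None:
        os.environ["FSDP_TRANSFORMER_CLS_TO_WRAP"] = ",".join(fsdp_config["fsdp_transformer_layer_cls_to_wrap"])


set_auto_wrap_env({"fsdp_transformer_layer_cls_to_wrap": ["BertLayer"]})
assert os.environ["FSDP_AUTO_WRAP_POLICY"] == "TRANSFORMER_BASED_WRAP"
```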
From 6a9d8e236b53c7287e6434af5494aebc9ede2629 Mon Sep 17 00:00:00 2001
From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Date: Fri, 21 Jul 2023 15:27:49 +0530
Subject: [PATCH 3/4] parity for FSDP+XLA

---
 docs/source/en/main_classes/trainer.md |  4 ++--
 src/transformers/trainer.py            | 12 ++++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/docs/source/en/main_classes/trainer.md b/docs/source/en/main_classes/trainer.md
index dcf076cc5ac431..ad3ea57f13342d 100644
--- a/docs/source/en/main_classes/trainer.md
+++ b/docs/source/en/main_classes/trainer.md
@@ -441,7 +441,7 @@ as the model saving with FSDP activated is only available with recent fixes.
 - Remaining FSDP config is passed via `--fsdp_config `. It is either a location of
   FSDP json config file (e.g., `fsdp_config.json`) or an already loaded json file as `dict`.
   - If auto wrapping is enabled, you can either use transformer based auto wrap policy or size based auto wrap policy.
-    - For transformer based auto wrap policy, please specify `fsdp_transformer_layer_cls_to_wrap` in the config file.
+    - For transformer based auto wrap policy, it is recommended to specify `fsdp_transformer_layer_cls_to_wrap` in the config file. If not specified, the default value is `model._no_split_modules` when available.
      This specifies the list of transformer layer class name (case-sensitive) to wrap ,e.g, [`BertLayer`], [`GPTJBlock`], [`T5Block`] ....
      This is important because submodules that share weights (e.g., embedding layer) should not end up in different FSDP wrapped units.
      Using this policy, wrapping happens for each block containing Multi-Head Attention followed by couple of MLP layers.
@@ -482,7 +482,7 @@ Pass `--fsdp "full shard"` along with following changes to be made in `--fsdp_co
     This setting can only be used when the xla flag is set to true, and an auto wrapping policy is specified through
     `fsdp_min_num_params` or `fsdp_transformer_layer_cls_to_wrap`.
   - You can either use transformer based auto wrap policy or size based auto wrap policy.
-    - For transformer based auto wrap policy, please specify `fsdp_transformer_layer_cls_to_wrap` in the config file.
+    - For transformer based auto wrap policy, it is recommended to specify `fsdp_transformer_layer_cls_to_wrap` in the config file. If not specified, the default value is `model._no_split_modules` when available.
      This specifies the list of transformer layer class name (case-sensitive) to wrap ,e.g, [`BertLayer`], [`GPTJBlock`], [`T5Block`] ....
      This is important because submodules that share weights (e.g., embedding layer) should not end up in different FSDP wrapped units.
      Using this policy, wrapping happens for each block containing Multi-Head Attention followed by couple of MLP layers.
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index fd090eb028e22c..829da1149b5cf1 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1377,18 +1377,26 @@ def _wrap_model(self, model, training=True, dataloader=None):
                 raise ImportError("Missing XLA FSDP related module; please make sure to use torch-xla >= 2.0.")
             auto_wrap_policy = None
             auto_wrapper_callable = None
+            default_transformer_cls_names_to_wrap = (
+                model._no_split_modules if hasattr(model, "_no_split_modules") else None
+            )
+            fsdp_transformer_layer_cls_to_wrap = self.args.fsdp_config.get(
+                "fsdp_transformer_layer_cls_to_wrap", default_transformer_cls_names_to_wrap
+            )
+
             if self.args.fsdp_config["fsdp_min_num_params"] > 0:
                 auto_wrap_policy = functools.partial(
                     size_based_auto_wrap_policy, min_num_params=self.args.fsdp_config["fsdp_min_num_params"]
                 )
-            elif self.args.fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", None) is not None:
+            elif fsdp_transformer_layer_cls_to_wrap is not None:
                 transformer_cls_to_wrap = set()
-                for layer_class in self.args.fsdp_config["fsdp_transformer_layer_cls_to_wrap"]:
+                for layer_class in fsdp_transformer_layer_cls_to_wrap:
                     transformer_cls = get_module_class_from_name(model, layer_class)
                     if transformer_cls is None:
                         raise Exception("Could not find the transformer layer class to wrap in the model.")
                     else:
                         transformer_cls_to_wrap.add(transformer_cls)
+
                 auto_wrap_policy = functools.partial(
                     transformer_auto_wrap_policy,
                     # Transformer layer class to wrap
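Patch 3/4 gives the XLA FSDP branch the same fallback the accelerate-backed branch already documents: when `fsdp_transformer_layer_cls_to_wrap` is missing from `fsdp_config`, the wrap classes default to `model._no_split_modules`. Below is a condensed sketch of that selection logic outside the Trainer (not the patch's code); it uses the upstream `torch.distributed.fsdp.wrap` policies rather than the `torch_xla` ones the patch imports, and `get_module_class_from_name` is a simplified stand-in for the transformers helper of the same name.

```python
import functools

from torch import nn
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy


def get_module_class_from_name(module: nn.Module, name: str):
    """Simplified stand-in: find a submodule class by its (case-sensitive) name."""
    if module.__class__.__name__ == name:
        return module.__class__
    for child in module.children():
        found = get_module_class_from_name(child, name)
        if found is not None:
            return found
    return None


def pick_auto_wrap_policy(model: nn.Module, fsdp_config: dict):
    # Fall back to the model's `_no_split_modules` when the user did not set
    # `fsdp_transformer_layer_cls_to_wrap`, mirroring the accelerate-backed path.
    default_cls_names = getattr(model, "_no_split_modules", None)
    layer_cls_names = fsdp_config.get("fsdp_transformer_layer_cls_to_wrap", default_cls_names)

    if fsdp_config.get("fsdp_min_num_params", 0) > 0:
        return functools.partial(size_based_auto_wrap_policy, min_num_params=fsdp_config["fsdp_min_num_params"])
    if layer_cls_names is not None:
        transformer_cls_to_wrap = set()
        for layer_class in layer_cls_names:
            transformer_cls = get_module_class_from_name(model, layer_class)
            if transformer_cls is None:
                raise Exception("Could not find the transformer layer class to wrap in the model.")
            transformer_cls_to_wrap.add(transformer_cls)
        return functools.partial(transformer_auto_wrap_policy, transformer_layer_cls=transformer_cls_to_wrap)
    return None
```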
From c3a225f16482f4a515702f640fabac393f2fb4ff Mon Sep 17 00:00:00 2001
From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Date: Fri, 21 Jul 2023 16:29:05 +0530
Subject: [PATCH 4/4] Update trainer.py

---
 src/transformers/trainer.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 829da1149b5cf1..3611e99035be9d 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1377,9 +1377,7 @@ def _wrap_model(self, model, training=True, dataloader=None):
                 raise ImportError("Missing XLA FSDP related module; please make sure to use torch-xla >= 2.0.")
             auto_wrap_policy = None
             auto_wrapper_callable = None
-            default_transformer_cls_names_to_wrap = (
-                model._no_split_modules if hasattr(model, "_no_split_modules") else None
-            )
+            default_transformer_cls_names_to_wrap = getattr(model, "_no_split_modules", None)
             fsdp_transformer_layer_cls_to_wrap = self.args.fsdp_config.get(
                 "fsdp_transformer_layer_cls_to_wrap", default_transformer_cls_names_to_wrap
             )
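Net effect for a user, shown as a hypothetical minimal configuration (not taken from the patches; the `fsdp` and `fsdp_config` argument names follow the Trainer docs quoted in patch 3/4): with `fsdp_transformer_layer_cls_to_wrap` omitted, both the accelerate-backed and the XLA FSDP paths now fall back to the model's `_no_split_modules` for transformer-based auto wrapping.

```python
from transformers import TrainingArguments

# Hypothetical setup: no `fsdp_transformer_layer_cls_to_wrap`, so the transformer-based
# auto wrap policy falls back to `model._no_split_modules` when the model defines it.
training_args = TrainingArguments(
    output_dir="out",
    fsdp="full_shard auto_wrap",
    fsdp_config={
        "xla": True,               # exercise the XLA FSDP path touched by patches 3/4 and 4/4
        "fsdp_min_num_params": 0,  # keep the transformer-based policy (a size-based policy would override it)
    },
)
```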