diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index af908e48e4b8c4..e192b9da2a6adf 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -2251,7 +2251,8 @@ def _inner_training_loop(
         else:
             debug_overflow = DebugUnderflowOverflow(self.model)  # noqa
 
-        delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled
+        delay_optimizer_creation = (is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled
+                                    or (self.is_fsdp_enabled and not args.fp8))
 
         # We need to reset the scheduler, as its parameters may be different on subsequent calls
         if self._created_lr_scheduler:
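
For context, a minimal sketch of the pattern `delay_optimizer_creation` controls: when the flag is true, the optimizer is built only after the model has been wrapped (e.g. by FSDP), so it holds references to the wrapped module's parameters. This is not the Trainer's actual code; `build_optimizer_maybe_delayed`, the `wrap` callable, and the toy model are hypothetical stand-ins used only for illustration.

import torch
from torch import nn

def build_optimizer_maybe_delayed(model: nn.Module, wrap, delay_optimizer_creation: bool):
    # `wrap` stands in for the FSDP/accelerate wrapping step (hypothetical here).
    if not delay_optimizer_creation:
        # Optimizer created before wrapping: it references the unwrapped parameters.
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
        model = wrap(model)
    else:
        # Optimizer created after wrapping: it references the wrapped (e.g. sharded) parameters.
        model = wrap(model)
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    return model, optimizer

if __name__ == "__main__":
    # Identity wrapping keeps the sketch runnable without FSDP installed/configured.
    model, optimizer = build_optimizer_maybe_delayed(
        nn.Linear(8, 8), wrap=lambda m: m, delay_optimizer_creation=True
    )
    print(type(model).__name__, len(optimizer.param_groups[0]["params"]))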