diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index f30b2eec73661c..0a75f88c13db86 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -566,7 +566,7 @@ class TrainingArguments:
             used when the xla flag is set to true, and an auto wrapping policy is specified through
             fsdp_min_num_params or fsdp_transformer_layer_cls_to_wrap.
         tp_size (`int`, *optional*):
-            Use tp_size to enable pytorch 2.0 tensor parallelism. Set a value greater than 1 to activate TP. The same is
+            Use tp_size to enable PyTorch tensor parallelism. Set a value greater than 1 to activate TP. The same is
             used to prepare device mesh internally.
         deepspeed (`str` or `dict`, *optional*):
             Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may
@@ -1245,7 +1245,7 @@ class TrainingArguments:
         default=0,
         metadata={
             "help": (
-                "Use tp_size to enable pytorch 2.0 tensor parallelism."
+                "Use tp_size to enable PyTorch tensor parallelism."
                 "Set a value greater than 1 to activate TP."
                 "The same is used to prepare device mesh internally."
            )
@@ -1978,6 +1978,7 @@ def __post_init__(self):
 
         if self.tp_size > 1:
             os.environ["ACCELERATE_USE_TP"] = "true"
+            os.environ["TP_SIZE"] = str(self.tp_size)
         # accelerate integration for FSDP
         if len(self.fsdp) > 0 and not self.fsdp_config["xla"]:
             os.environ["ACCELERATE_USE_FSDP"] = "true"
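
For reference, a minimal usage sketch of the argument touched by this diff (not part of the patch itself; the output directory, batch size, and torchrun launch are assumptions): setting tp_size greater than 1 makes __post_init__ export ACCELERATE_USE_TP and TP_SIZE, which are then used to prepare the device mesh.

# usage_sketch.py -- hypothetical example, launched on a multi-GPU node with e.g.
#   torchrun --nproc-per-node 2 usage_sketch.py
import os

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="tp_test",            # placeholder path
    per_device_train_batch_size=8,   # placeholder value
    tp_size=2,                       # > 1 activates TP and sizes the device mesh
)

# Per this diff, __post_init__ exports the flags read downstream for tensor parallelism.
print(os.environ.get("ACCELERATE_USE_TP"))  # "true"
print(os.environ.get("TP_SIZE"))            # "2"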