diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index 99950a2ffd8e93..7f326bd0c006db 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -85,10 +85,10 @@ LLaVa also supports batched inference. Here is how you can do it: import requests from PIL import Image import torch -from transformers import AutoProcessor, LLavaForConditionalGeneration +from transformers import AutoProcessor, LlavaForConditionalGeneration # Load the model in half-precision -model = LLavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto") +model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto") processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") # Get two different images diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index ef8ed444d9d49b..91c6ebd40dab4d 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -45,19 +45,19 @@ In short, supporting a wide range of quantization methods allows you to pick the Use the table below to help you decide which quantization method to use. -| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library | -|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------| -| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | -| [AWQ](./awq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | ? 
| 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | -| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | -| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | -| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | -| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp | -| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | -| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | -| [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto | -| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | -| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | +| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library | +|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-----------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------| +| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM | +| [AWQ](./awq) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? 
| 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ | +| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🟡 * | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes | +| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors | +| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ | +| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp | +| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | +| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | +| [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto | +| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | +| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao | diff --git a/setup.py b/setup.py index cbfcfd43428524..922258d65efab7 100644 --- a/setup.py +++ b/setup.py @@ -117,7 +117,7 @@ "fugashi>=1.0", "GitPython<3.1.19", "hf-doc-builder>=0.3.0", - "huggingface-hub>=0.23.2,<1.0", + "huggingface-hub>=0.24.0,<1.0", "importlib_metadata", "ipadic>=1.0.0,<2.0", "isort>=5.5.4", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index a633f54a4af1a8..9543b58ad40d91 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -24,7 +24,7 @@ "fugashi": "fugashi>=1.0", "GitPython": "GitPython<3.1.19", "hf-doc-builder": "hf-doc-builder>=0.3.0", - "huggingface-hub": "huggingface-hub>=0.23.2,<1.0", + "huggingface-hub": 
"huggingface-hub>=0.24.0,<1.0", "importlib_metadata": "importlib_metadata", "ipadic": "ipadic>=1.0.0,<2.0", "isort": "isort>=5.5.4", diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index b24c4c9e0d5cad..4b236b9155f158 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -918,7 +918,7 @@ def on_train_end(self, args, state, control, model=None, tokenizer=None, **kwarg if self._log_model.is_enabled and self._initialized and state.is_world_process_zero: from ..trainer import Trainer - fake_trainer = Trainer(args=args, model=model, processing_class=tokenizer) + fake_trainer = Trainer(args=args, model=model, processing_class=tokenizer, eval_dataset=["fake"]) with tempfile.TemporaryDirectory() as temp_dir: fake_trainer.save_model(temp_dir) metadata = ( diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index d0dacc4802438d..3eff746774604c 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -95,7 +95,7 @@ replace_return_docstrings, strtobool, ) -from .utils.hub import convert_file_size_to_int, create_and_tag_model_card, get_checkpoint_shard_files +from .utils.hub import create_and_tag_model_card, get_checkpoint_shard_files from .utils.import_utils import ( ENV_VARS_TRUE_VALUES, is_sagemaker_mp_enabled, @@ -382,92 +382,6 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi return False -def shard_checkpoint( - state_dict: Dict[str, torch.Tensor], max_shard_size: Union[int, str] = "10GB", weights_name: str = WEIGHTS_NAME -): - """ - Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a - given size. 
- - The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no - optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the - limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB], - [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB]. - - - - If one of the model's weight is bigger than `max_shard_size`, it will end up in its own sub-checkpoint which will - have a size greater than `max_shard_size`. - - - - Args: - state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save. - max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`): - The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit - (like `"5MB"`). - weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`): - The name of the model save file. - """ - logger.warning( - "Note that `shard_checkpoint` is deprecated and will be removed in v4.44. 
We recommend you using " - "split_torch_state_dict_into_shards from huggingface_hub library" - ) - max_shard_size = convert_file_size_to_int(max_shard_size) - - sharded_state_dicts = [{}] - last_block_size = 0 - total_size = 0 - storage_id_to_block = {} - - for key, weight in state_dict.items(): - # when bnb serialization is used the weights in the state dict can be strings - # check: https://github.com/huggingface/transformers/pull/24416 for more details - if isinstance(weight, str): - continue - else: - storage_id = id_tensor_storage(weight) - - # If a `weight` shares the same underlying storage as another tensor, we put `weight` in the same `block` - if storage_id in storage_id_to_block and weight.device != torch.device("meta"): - block_id = storage_id_to_block[storage_id] - sharded_state_dicts[block_id][key] = weight - continue - - weight_size = weight.numel() * dtype_byte_size(weight.dtype) - # If this weight is going to tip up over the maximal size, we split, but only if we have put at least one - # weight in the current shard. 
- if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0: - sharded_state_dicts.append({}) - last_block_size = 0 - - sharded_state_dicts[-1][key] = weight - last_block_size += weight_size - total_size += weight_size - storage_id_to_block[storage_id] = len(sharded_state_dicts) - 1 - - # If we only have one shard, we return it - if len(sharded_state_dicts) == 1: - return {weights_name: sharded_state_dicts[0]}, None - - # Otherwise, let's build the index - weight_map = {} - shards = {} - for idx, shard in enumerate(sharded_state_dicts): - shard_file = weights_name.replace(".bin", f"-{idx+1:05d}-of-{len(sharded_state_dicts):05d}.bin") - shard_file = shard_file.replace( - ".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors" - ) - shards[shard_file] = shard - for key in shard.keys(): - weight_map[key] = shard_file - - # Add the metadata - metadata = {"total_size": total_size} - index = {"metadata": metadata, "weight_map": weight_map} - return shards, index - - def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True): """ This is the same as diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 08e42d1c8f70cb..d34528b7431453 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -2203,7 +2203,7 @@ def forward( logger.warning_once( "Expanding inputs for image tokens in BLIP-2 should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." 
) inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) attention_mask = torch.cat( @@ -2326,7 +2326,7 @@ def generate( logger.warning_once( "Expanding inputs for image tokens in BLIP-2 should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) attention_mask = torch.cat( diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index c6852378412895..4129920f9b3663 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -153,7 +153,7 @@ def __call__( logger.warning_once( "Expanding inputs for image tokens in BLIP-2 should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." 
) # cast to desired return tensors type diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index a78a3b66877429..e5622185bc39a8 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -1471,7 +1471,7 @@ def forward( logger.warning_once( "Expanding inputs for image tokens in InstructBLIP should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) attention_mask = torch.cat( @@ -1610,7 +1610,7 @@ def generate( logger.warning_once( "Expanding inputs for image tokens in InstructBLIP should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." 
) inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) attention_mask = torch.cat( diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py index 3d48839d376c5c..a96d97fb07e1d9 100644 --- a/src/transformers/models/instructblip/processing_instructblip.py +++ b/src/transformers/models/instructblip/processing_instructblip.py @@ -148,7 +148,7 @@ def __call__( logger.warning_once( "Expanding inputs for image tokens in InstructBLIP should be done in processing. " "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) # cast to desired return tensors type after concatenating diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index 6d6bf4a6f38e3f..e8536ee50f94bb 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -485,7 +485,7 @@ def forward( "Expanding inputs for image tokens in LLaVa should be done in processing. " "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." 
) # prefill stage vs decoding stage (legacy behavior copied) if input_ids.shape[1] != 1: diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index 0ff40acc405224..820fa581711a63 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -160,7 +160,7 @@ def __call__( "Expanding inputs for image tokens in LLaVa should be done in processing. " "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 2d23c48225cd00..269663c7d6141a 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -868,7 +868,7 @@ def forward( "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. " "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." 
) if input_ids.shape[1] != 1: inputs_embeds = inputs_embeds.to(image_features.dtype) diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 310083c1ce53ac..89b885f0f1abb2 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -143,7 +143,7 @@ def __call__( "Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. " "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) else: image_sizes = iter(image_inputs["image_sizes"]) diff --git a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py index 502aa78263649a..284d8a3d454848 100644 --- a/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py +++ b/src/transformers/models/megatron_gpt2/checkpoint_reshaping_and_interoperability.py @@ -21,10 +21,11 @@ import types import torch +from huggingface_hub import split_torch_state_dict_into_shards from packaging import version from transformers import AutoTokenizer, GPT2Config -from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, shard_checkpoint +from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME def add_checkpointing_args(parser): @@ -571,7 +572,15 @@ def convert_checkpoint_from_megatron_to_transformers(args): # Store the state_dict to file. 
max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size - shards, index = shard_checkpoint(output_state_dict, max_shard_size=max_shard_size) + state_dict_split = split_torch_state_dict_into_shards(output_state_dict, max_shard_size=max_shard_size) + # Map each shard filename to its own tensor dict so every shard is saved under its file name. + shards = { + shard_file: {tensor: output_state_dict[tensor] for tensor in tensors} + for shard_file, tensors in state_dict_split.filename_to_tensors.items() + } + index = None + if state_dict_split.is_sharded: + index = {"metadata": state_dict_split.metadata, "weight_map": state_dict_split.tensor_to_filename} # Save the model for shard_file, shard in shards.items(): diff --git a/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py index 44cf17b1cf1899..a0c97fc4e234ab 100644 --- a/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py +++ b/src/transformers/models/rwkv/convert_rwkv_checkpoint_to_hf.py @@ -21,10 +21,10 @@ import re import torch -from huggingface_hub import hf_hub_download +from huggingface_hub import hf_hub_download, split_torch_state_dict_into_shards from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast, RwkvConfig -from transformers.modeling_utils import WEIGHTS_INDEX_NAME, shard_checkpoint +from transformers.modeling_utils import WEIGHTS_INDEX_NAME NUM_HIDDEN_LAYERS_MAPPING = { @@ -116,7 +116,16 @@ def convert_rmkv_checkpoint_to_hf_format( state_dict = convert_state_dict(state_dict) # 4. 
Split in shards and save - shards, index = shard_checkpoint(state_dict) + state_dict_split = split_torch_state_dict_into_shards(state_dict) + # Map each shard filename to its own tensor dict so every shard is saved under its file name. + shards = { + shard_file: {tensor: state_dict[tensor] for tensor in tensors} + for shard_file, tensors in state_dict_split.filename_to_tensors.items() + } + index = None + if state_dict_split.is_sharded: + index = {"metadata": state_dict_split.metadata, "weight_map": state_dict_split.tensor_to_filename} + for shard_file, shard in shards.items(): torch.save(shard, os.path.join(output_dir, shard_file)) diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index a3b3de33fa66ee..30adcb6ab5c089 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -578,7 +578,7 @@ def forward( "Expanding inputs for image tokens in Video-LLaVa should be done in processing. " "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." 
) if input_ids.shape[1] != 1: for features, frames in ((image_features, 1), (video_features, num_frames)): diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index d3c27ef56ca0ce..597d94cc2f0031 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -149,9 +149,10 @@ def __call__( if encoded_images is not None and (self.patch_size is None or self.vision_feature_select_strategy is None): logger.warning_once( "Expanding inputs for image tokens in Video-LLaVa should be done in processing. " - "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly " - "with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.44." + "Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set " + "directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = " + "{{vision_feature_select_strategy}}`. Using processors without these attributes in the config is " + "deprecated and will throw an error in v4.50." ) # Replace the image/video tokens with the expanded token sequence elif encoded_images is not None: diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 4060f8c8ecd1bf..b45325d2194e24 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -476,7 +476,7 @@ def forward( logger.warning_once( "Expanding inputs for image tokens in VipLLaVa should be done in processing. 
" "Please add `patch_size` and `vision_feature_select_strategy` to the model's image processing config. " - "Using processors without these attributes in the config is deprecated and will throw an error in v4.47." + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." ) # prefill stage vs decoding stage (legacy behavior copied) if input_ids.shape[1] != 1: diff --git a/src/transformers/quantizers/quantizer_awq.py b/src/transformers/quantizers/quantizer_awq.py index 18b883429c5ec5..0c14c236d26036 100644 --- a/src/transformers/quantizers/quantizer_awq.py +++ b/src/transformers/quantizers/quantizer_awq.py @@ -57,14 +57,14 @@ def validate_environment(self, device_map, **kwargs): raise RuntimeError( "To use IPEX backend, you need autoawq>0.6.2. Please install the latest version or from source." ) - if ( - device_map is not None - and isinstance(device_map, dict) - and (torch.device("cpu") not in device_map.values() or len(device_map.values()) > 1) - ): + if device_map is None: + logger.warning_once( + "You have loaded an AWQ model without setting device_map, please set 'cpu' or 'xpu' or 'auto'" + ) + elif isinstance(device_map, dict) and "disk" in device_map.values(): raise ValueError( - "You are attempting to load an IPEX version AWQ model with a device_map that contains more than CPU." - " This is not supported. Please make sure only cpu in the device_map." + "You are attempting to load an IPEX version AWQ model with a device_map that contains disk device." + " This is not supported. Please make sure only cpu and xpu in the device_map." 
) else: if not torch.cuda.is_available(): diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 381f3ef497d9bd..03df02d21ff32b 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1690,8 +1690,12 @@ def apply_chat_template( final_message = chat[-1]["content"] if isinstance(final_message, (list, tuple)): final_message = final_message[-1]["text"] - final_message = final_message.strip() - rendered_chat = rendered_chat[: rendered_chat.rindex(final_message) + len(final_message)].rstrip() + try: + rendered_chat = rendered_chat[: rendered_chat.rindex(final_message) + len(final_message)] + except ValueError: + # Some chat templates like Llama-3.1 trim messages before rendering, so we must do the same here. + final_message = final_message.strip() + rendered_chat = rendered_chat[: rendered_chat.rindex(final_message) + len(final_message)] rendered.append(rendered_chat) if not is_batched: diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index d41b7181be6334..1603a4ec215557 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -66,7 +66,7 @@ from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available from .integrations.tpu import tpu_spmd_dataloader from .modelcard import TrainingSummary -from .modeling_utils import PreTrainedModel, load_sharded_checkpoint +from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model from .models.auto.modeling_auto import ( MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_MAPPING_NAMES, @@ -2277,8 +2277,11 @@ def _inner_training_loop( # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX use_accelerator_prepare = True if model is self.model else False - # configure fsdp plugin for qlora if any - if use_accelerator_prepare: + if use_accelerator_prepare and self.is_fsdp_enabled: + # In case of auto_find_batch_size=True + # Remove FSDP 
wrapping from sub-models. + self.model = unwrap_model(self.model, recursive=True) + # configure fsdp plugin for qlora if any self._fsdp_qlora_plugin_updates() if delay_optimizer_creation: @@ -2488,7 +2491,7 @@ def _inner_training_loop( # We explicitly want to avoid relying on `accelerator.accumulate` for generation training context = ( functools.partial(self.accelerator.no_sync, model=model) - if i == len(batch_samples) - 1 + if i != len(batch_samples) - 1 else contextlib.nullcontext ) with context(): diff --git a/tests/fsdp/test_fsdp.py b/tests/fsdp/test_fsdp.py index 7e14cc8c9e6fc9..74a3bfe04b7506 100644 --- a/tests/fsdp/test_fsdp.py +++ b/tests/fsdp/test_fsdp.py @@ -224,6 +224,18 @@ def test_basic_run(self, sharding_strategy, dtype): cmd = launcher + script + args + fsdp_args execute_subprocess_async(cmd, env=self.get_env()) + @parameterized.expand(params, name_func=_parameterized_custom_name_func) + @require_torch_multi_accelerator + @slow + def test_basic_run_with_gradient_accumulation(self, sharding_strategy, dtype): + launcher = get_launcher(distributed=True, use_accelerate=False) + output_dir = self.get_auto_remove_tmp_dir() + args = self.get_base_args(output_dir, 1, 50).split() + [f"--{dtype}", "--gradient_accumulation_steps", "2"] + fsdp_args = ["--fsdp", f"{sharding_strategy} auto_wrap", "--fsdp_transformer_layer_cls_to_wrap", "BertLayer"] + script = [f"{self.examples_dir_str}/pytorch/text-classification/run_glue.py"] + cmd = launcher + script + args + fsdp_args + execute_subprocess_async(cmd, env=self.get_env()) + @parameterized.expand(dtypes) @require_torch_multi_accelerator @slow diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index a3bbbf3c9e97b2..f04a4255556baf 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1461,6 +1461,38 @@ def test_continue_final_message(self): "<|im_start|>system\nsystem message<|im_end|>\n<|im_start|>user\nuser 
message<|im_end|>\n<|im_start|>assistant\nassistant message", ) + @require_jinja + def test_continue_final_message_with_trim(self): + """Regression test for chat templates with trimming: https://github.com/huggingface/transformers/pull/34214""" + + dummy_template = """ + {%- for message in messages %} + {{- "<|im_start|>" + message['role'] + "\n" + message['content'] | trim + "<|im_end|>" + "\n"}} + {%- endfor %}""" + dummy_conversation = [ + {"role": "system", "content": "system message"}, + {"role": "user", "content": "user message"}, + {"role": "assistant", "content": "assistant message "}, # Note the trailing whitespace + ] + tokenizers = self.get_tokenizers() + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + output = tokenizer.apply_chat_template( + dummy_conversation, chat_template=dummy_template, tokenize=False, continue_final_message=False + ) + self.assertEqual( + output, + "<|im_start|>system\nsystem message<|im_end|>\n<|im_start|>user\nuser message<|im_end|>\n<|im_start|>assistant\nassistant message<|im_end|>\n", + ) + prefill_output = tokenizer.apply_chat_template( + dummy_conversation, chat_template=dummy_template, tokenize=False, continue_final_message=True + ) + # Assert that the final message is unterminated + self.assertEqual( + prefill_output, + "<|im_start|>system\nsystem message<|im_end|>\n<|im_start|>user\nuser message<|im_end|>\n<|im_start|>assistant\nassistant message", + ) + @require_jinja def test_chat_template_dict(self): dummy_template_1 = "{{'a'}}" diff --git a/tests/trainer/test_trainer_fsdp.py b/tests/trainer/test_trainer_fsdp.py index 4bcf5de04520e2..eca6a30664f045 100644 --- a/tests/trainer/test_trainer_fsdp.py +++ b/tests/trainer/test_trainer_fsdp.py @@ -117,6 +117,33 @@ def test_trainer(self): execute_subprocess_async(cmd, env=self.get_env()) # successful return here == success - any errors would have caused an error in the sub-call + class TestFSDPTrainerWrap(TestCasePlus): + 
@require_accelerate + @require_torch_multi_gpu + @require_fsdp + def test_trainer(self): + output_dir = self.get_auto_remove_tmp_dir() + cmd = [ + "accelerate", + "launch", + "--use_fsdp", + "--main_process_port", + f"{get_torch_dist_unique_port()}", + "--num_processes", + f"{torch.cuda.device_count()}", + "--fsdp_transformer_layer_cls_to_wrap", + "GPT2Block", + f"{self.test_file_dir}/test_trainer_fsdp.py", + "--output_dir", + f"{output_dir}", + "--report_to", + "none", + "--auto_find_batch_size", + "True", + ] + execute_subprocess_async(cmd, env=self.get_env()) + # successful return here == success - any errors would have caused an error in the sub-call + if __name__ == "__main__": parser = HfArgumentParser((Seq2SeqTrainingArguments,)) diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index 5fd6251224c3ed..96a30df7e5587f 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -105,7 +105,6 @@ _find_disjoint, _find_identical, dtype_byte_size, - shard_checkpoint, ) from transformers.pytorch_utils import isin_mps_friendly @@ -668,71 +667,6 @@ def test_no_super_init_config_and_model(self): for p1, p2 in zip(model.parameters(), new_model.parameters()): self.assertTrue(torch.equal(p1, p2)) - def test_shard_checkpoint(self): - # This is the model we will use, total size 340,000 bytes. 
- model = torch.nn.Sequential( - torch.nn.Linear(100, 200, bias=False), # size 80,000 - torch.nn.Linear(200, 200, bias=False), # size 160,000 - torch.nn.Linear(200, 100, bias=False), # size 80,000 - torch.nn.Linear(100, 50, bias=False), # size 20,000 - ) - state_dict = model.state_dict() - - with self.subTest("No shard when max size is bigger than model size"): - shards, index = shard_checkpoint(state_dict) - self.assertIsNone(index) - self.assertDictEqual(shards, {WEIGHTS_NAME: state_dict}) - - with self.subTest("Test sharding, no weights bigger than max size"): - shards, index = shard_checkpoint(state_dict, max_shard_size="300kB") - # Split is first two layers then last two. - self.assertDictEqual( - index, - { - "metadata": {"total_size": 340000}, - "weight_map": { - "0.weight": "pytorch_model-00001-of-00002.bin", - "1.weight": "pytorch_model-00001-of-00002.bin", - "2.weight": "pytorch_model-00002-of-00002.bin", - "3.weight": "pytorch_model-00002-of-00002.bin", - }, - }, - ) - - shard1 = {"0.weight": state_dict["0.weight"], "1.weight": state_dict["1.weight"]} - shard2 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]} - self.assertDictEqual( - shards, {"pytorch_model-00001-of-00002.bin": shard1, "pytorch_model-00002-of-00002.bin": shard2} - ) - - with self.subTest("Test sharding with weights bigger than max size"): - shards, index = shard_checkpoint(state_dict, max_shard_size="100kB") - # Split is first layer, second layer then last 2. 
- self.assertDictEqual( - index, - { - "metadata": {"total_size": 340000}, - "weight_map": { - "0.weight": "pytorch_model-00001-of-00003.bin", - "1.weight": "pytorch_model-00002-of-00003.bin", - "2.weight": "pytorch_model-00003-of-00003.bin", - "3.weight": "pytorch_model-00003-of-00003.bin", - }, - }, - ) - - shard1 = {"0.weight": state_dict["0.weight"]} - shard2 = {"1.weight": state_dict["1.weight"]} - shard3 = {"2.weight": state_dict["2.weight"], "3.weight": state_dict["3.weight"]} - self.assertDictEqual( - shards, - { - "pytorch_model-00001-of-00003.bin": shard1, - "pytorch_model-00002-of-00003.bin": shard2, - "pytorch_model-00003-of-00003.bin": shard3, - }, - ) - def test_checkpoint_sharding_local_bin(self): model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert")