diff --git a/.github/workflows/self-pr-slow-ci.yml b/.github/workflows/self-pr-slow-ci.yml index 8225e5b6aa7b1d..2287b5e3f31587 100644 --- a/.github/workflows/self-pr-slow-ci.yml +++ b/.github/workflows/self-pr-slow-ci.yml @@ -4,7 +4,7 @@ on: pull_request: paths: - "src/transformers/models/*/modeling_*.py" - - "tests/models/*/test_*.py" + - "tests/**/test_*.py" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 430670aa4364e6..740bb4b0719c61 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -157,6 +157,8 @@ title: EETQ - local: quantization/hqq title: HQQ + - local: quantization/fbgemm_fp8 + title: FBGEMM_FP8 - local: quantization/optimum title: Optimum - local: quantization/contribute diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index d840caaf660520..c4069dd1afc706 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -580,7 +580,7 @@ default template for that model class is used instead. Let's take a look at the >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` @@ -704,23 +704,6 @@ with other names, pass the name of the template you want to the `chat_template` We find that this can be a bit confusing for users, though - so if you're writing a template yourself, we recommend trying to put it all in a single template where possible! -### What are "default" templates? - -Before the introduction of chat templates, chat handling was hardcoded at the model class level. For backwards -compatibility, we have retained this class-specific handling as default templates, also set at the class level. If a -model does not have a chat template set, but there is a default template for its model class, the `TextGenerationPipeline` -class and methods like `apply_chat_template` will use the class template instead. You can find out what the default -template for your tokenizer is by checking the `tokenizer.default_chat_template` attribute. - -This is something we do purely for backward compatibility reasons, to avoid breaking any existing workflows. Even when -the class template is appropriate for your model, we strongly recommend overriding the default template by -setting the `chat_template` attribute explicitly to make it clear to users that your model has been correctly configured -for chat. - -Now that actual chat templates have been adopted more widely, default templates have been deprecated and will be -removed in a future release. We strongly recommend setting the `chat_template` attribute for any tokenizers that -still depend on them! - ### What template should I use? 
When setting the template for a model that's already been trained for chat, you should ensure that the template diff --git a/docs/source/en/conversations.md b/docs/source/en/conversations.md index 9336503ad7cb8c..a48c046b4949d7 100644 --- a/docs/source/en/conversations.md +++ b/docs/source/en/conversations.md @@ -195,7 +195,7 @@ inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()} print("Tokenized inputs:\n", inputs) # 4: Generate text from the model -outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.) +outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1) print("Generated tokens:\n", outputs) # 5: Decode the output back to a string diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index 5e49f0e1ebd3ab..8e7e9c54d42a42 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -18,59 +18,109 @@ Basic inference is slow because LLMs have to be called repeatedly to generate th This guide will show you how to use the optimization techniques available in Transformers to accelerate LLM inference. > [!TIP] -> Hugging Face also provides [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a library dedicated to deploying and serving highly optimized LLMs for inference. It includes more optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference. +> Hugging Face also provides [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a library dedicated to deploying and serving highly optimized LLMs for inference. It includes deployment-oriented optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference. -## Static kv-cache and torch.compile +## Static kv-cache and `torch.compile` During decoding, a LLM computes the key-value (kv) values for each input token and since it is autoregressive, it computes the same kv values each time because the generated output becomes part of the input now. This is not very efficient because you're recomputing the same kv values each time. -To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [torch.compile](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels. +To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [`torch.compile`](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels. -The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with torch.compile for up to a 4x speed up. +The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with `torch.compile` for up to a 4x speed up. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware. > [!WARNING] -> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and torch.compile. 
Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list. +> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and `torch.compile`. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list. -For this example, let's load the [Gemma](https://hf.co/google/gemma-2b) model. +There are three flavors of static kv-cache usage, depending on the complexity of your task: +1. Basic usage: simply set a flag in `generation_config` (recommended); +2. Advanced usage: handle a cache object for multi-turn generation or a custom generation loop; +3. Advanced usage: compile the entire `generate` function into a single graph, if having a single graph is relevant for you. + +Select the correct tab below for further instructions on each of these flavors. + +> [!TIP] +> Regardless of the strategy used with `torch.compile`, you can avoid shape-related recompilations if you left-pad your LLM inputs to a limited set of values. The [`pad_to_multiple_of` tokenizer flag](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__.pad_to_multiple_of) is your friend! + + + + +For this example, let's use the [Gemma](https://hf.co/google/gemma-2b) model. All we need to do is to: +1. Access the model's `generation_config` attribute and set the `cache_implementation` to "static"; +2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache. + +And that's it! ```py from transformers import AutoTokenizer, AutoModelForCausalLM +import torch +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") -model = AutoModelForCausalLM.from_pretrained( - "google/gemma-2b", device_map="auto" -) +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") + +model.generation_config.cache_implementation = "static" + +model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) +input_text = "The theory of special relativity states " +input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + +outputs = model.generate(**input_ids) +print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) +['The theory of special relativity states 1. The speed of light is constant in all inertial reference'] ``` -There are two ways you can configure the model to use a static kv-cache. For a 7B model on an A100, both methods get a 4x speed up in the forward pass. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware. If you're using the [`~GenerationMixin.generate`] method, the speed up is ~3x. The forward pass (which still gets 4x speed up) is only a part of the whole [`~GenerationMixin.generate`] code. +Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. Avoiding re-compilation is critical to get the most out of `torch.compile`, and you should be aware of the following: +1. If the batch size changes or the maximum output length increases between calls, the cache will have to be reinitialized, triggering a new compilation; +2. The first couple of calls of the compiled function are slower, as the function is being compiled. 
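As a rough illustration of the padding tip above, here is a minimal sketch (the batch of prompts, the `max_new_tokens` value, and the multiple of 64 are arbitrary choices; the Gemma checkpoint is the same one used in this example): left-padding every prompt to a multiple of a fixed value keeps the set of input shapes small, so the compiled forward pass is reused instead of being recompiled.

```py
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")
model.generation_config.cache_implementation = "static"
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

# Left-pad batched prompts so every call maps to one of a few fixed input lengths
tokenizer.padding_side = "left"
inputs = tokenizer(
    ["The theory of special relativity states ", "The speed of light is "],
    return_tensors="pt",
    padding=True,
    pad_to_multiple_of=64,
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```

Keeping the batch size and `max_new_tokens` constant across calls also avoids the cache re-initialization described above.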
- - +> [!WARNING] +> For a more advanced usage of the static cache, such as multi-turn conversations, we recommend instantiating and manipulating the cache object outside [`~GenerationMixin.generate`]. See the advanced usage tab. + + + -Access the model's `generation_config` attribute and set the `cache_implementation` to "static". +A [`StaticCache`] object can be passed to the model's [`~GenerationMixin.generate`] under the `past_key_values` argument. The object will retain the cache contents, so you can pass it to a new [`~GenerationMixin.generate`] call to continue generation, like you would do with a dynamic cache. ```py -model.generation_config.cache_implementation = "static" -``` +from transformers import AutoTokenizer, AutoModelForCausalLM, StaticCache +import torch +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) -Call torch.compile on the model to compile the forward pass with the static kv-cache. +tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") -```py -compiled_model = torch.compile(model, mode="reduce-overhead", fullgraph=True) +model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) input_text = "The theory of special relativity states " input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") +prompt_length = input_ids.input_ids.shape[1] +model.generation_config.max_new_tokens = 16 + +past_key_values = StaticCache( + config=model.config, + max_batch_size=1, + # If you plan to reuse the cache, make sure the cache length is large enough for all cases + max_cache_len=prompt_length+(model.generation_config.max_new_tokens*2), + device=model.device, + dtype=model.dtype +) +outputs = model.generate(**input_ids, past_key_values=past_key_values) +print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) +['The theory of special relativity states 1. The speed of light is constant in all inertial reference frames. 2'] -outputs = compiled_model.generate(**input_ids) -tokenizer.batch_decode(outputs, skip_special_tokens=True) -['The theory of special relativity states 1. The speed of light is constant in all inertial reference'] +# pass in the generated text and the same cache object to continue generation from where it left off. Optionally, in a +# multi-turn conversation, append the new user input to the generated text. +new_input_ids = outputs +outputs = model.generate(new_input_ids, past_key_values=past_key_values) +print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) +['The theory of special relativity states 1. The speed of light is constant in all inertial reference frames. 2. The speed of light is constant in all inertial reference frames. 3.'] ``` -Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. However, if the batch size or the maximum output length increase between calls, the cache will have to be reinitialized, triggering a new compilation. - - - +> [!TIP] +> If you want to reuse the same [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method between calls -A [`StaticCache`] object can be passed to the model's forward pass under the `past_key_values` argument, enabling the use of this object as a static kv-cache. 
Using this strategy, you can write your own function to decode the next token given the current token and position and cache position of previously generated tokens. You can also pass the [`StaticCache`] object to [`~GenerationMixin.generate`] and use it across calls, like you would do with a dynamic cache. +If you want to go further down a level, the [`StaticCache`] object can also be passed to the model's forward pass under the same `past_key_values` argument. Using this strategy, you can write your own function to decode the next token given the current token and position and cache position of previously generated tokens. ```py from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging @@ -102,12 +152,9 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_valu return new_token ``` -There are a few important things you must do to enable static kv-cache and torch.compile with the `StaticCache` method: - +There are a few important things you must do to enable static kv-cache and `torch.compile` with the `StaticCache` method: 1. Initialize the [`StaticCache`] instance before using the model for inference. There you can configure parameters like the maximum batch size and sequence length. - -2. Call torch.compile on the model to compile the forward pass with the static kv-cache. - +2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache. 3. Set `enable_math=True` in the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more. ```py @@ -142,8 +189,34 @@ text 'My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p'] ``` -> [!TIP] -> If you want to reuse the [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method + + + +Compiling the entire `generate` function, in terms of code, is even simpler than in the basic usage: call `torch.compile` on `generate` to compile the entire function. No need to specify the use of the static cache: although it is compatible, dynamic cache (default) was faster in our benchmarks. + +```py +from transformers import AutoTokenizer, AutoModelForCausalLM +import torch +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :) + +tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") +model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto") + +model.generate = torch.compile(model.generate, mode="reduce-overhead", fullgraph=True) +input_text = "The theory of special relativity states " +input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + +outputs = model.generate(**input_ids) +print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) +['The theory of special relativity states 1. The speed of light is constant in all inertial reference'] +``` + +As a result, we compile not only the model forward pass, but also all input preparation, logit processor operations, and so on. The result should be a slightly `generate` call, compared to the basic usage example, and the compiled graph may be better suited to more exotic hardware devices or use cases. However, there are severe drawbacks in using this approach: +1. Compilation is much slower; +2. 
All parameterization of `generate` must be done through `generation_config`; +3. Many warnings and exceptions are suppressed -- we suggest testing with its uncompiled form first; +4. Although we are working on it, it is heavily feature restricted (for instance, at the time of writing, generation does not stop if an EOS token is selected). diff --git a/docs/source/en/main_classes/data_collator.md b/docs/source/en/main_classes/data_collator.md index 74e653dd1185e9..e704bb747fe6e0 100644 --- a/docs/source/en/main_classes/data_collator.md +++ b/docs/source/en/main_classes/data_collator.md @@ -66,3 +66,8 @@ Examples of use can be found in the [example scripts](../examples) or [example n - numpy_mask_tokens - tf_mask_tokens - torch_mask_tokens + +## DataCollatorWithFlattening + +[[autodoc]] data.data_collator.DataCollatorWithFlattening + diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md index f1e2acdcfe4809..fc5808415cbe5f 100755 --- a/docs/source/en/main_classes/quantization.md +++ b/docs/source/en/main_classes/quantization.md @@ -56,3 +56,8 @@ Learn how to quantize models in the [Quantization](../quantization) guide. ## HqqConfig [[autodoc]] HqqConfig + +## FbgemmFp8Config + +[[autodoc]] FbgemmFp8Config + diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md index d258f492abf8b5..a6da554f8d5053 100644 --- a/docs/source/en/model_doc/grounding-dino.md +++ b/docs/source/en/model_doc/grounding-dino.md @@ -41,33 +41,40 @@ The original code can be found [here](https://github.com/IDEA-Research/Grounding Here's how to use the model for zero-shot object detection: ```python -import requests - -import torch -from PIL import Image -from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection, - -model_id = "IDEA-Research/grounding-dino-tiny" - -processor = AutoProcessor.from_pretrained(model_id) -model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device) - -image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" -image = Image.open(requests.get(image_url, stream=True).raw) -# Check for cats and remote controls -text = "a cat. a remote control." - -inputs = processor(images=image, text=text, return_tensors="pt").to(device) -with torch.no_grad(): - outputs = model(**inputs) - -results = processor.post_process_grounded_object_detection( - outputs, - inputs.input_ids, - box_threshold=0.4, - text_threshold=0.3, - target_sizes=[image.size[::-1]] -) +>>> import requests + +>>> import torch +>>> from PIL import Image +>>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection + +>>> model_id = "IDEA-Research/grounding-dino-tiny" +>>> device = "cuda" + +>>> processor = AutoProcessor.from_pretrained(model_id) +>>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device) + +>>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg" +>>> image = Image.open(requests.get(image_url, stream=True).raw) +>>> # Check for cats and remote controls +>>> text = "a cat. a remote control." + +>>> inputs = processor(images=image, text=text, return_tensors="pt").to(device) +>>> with torch.no_grad(): +... outputs = model(**inputs) + +>>> results = processor.post_process_grounded_object_detection( +... outputs, +... inputs.input_ids, +... box_threshold=0.4, +... text_threshold=0.3, +... target_sizes=[image.size[::-1]] +... 
) +>>> print(results) +[{'boxes': tensor([[344.6959, 23.1090, 637.1833, 374.2751], + [ 12.2666, 51.9145, 316.8582, 472.4392], + [ 38.5742, 70.0015, 176.7838, 118.1806]], device='cuda:0'), + 'labels': ['a cat', 'a cat', 'a remote control'], + 'scores': tensor([0.4785, 0.4381, 0.4776], device='cuda:0')}] ``` ## Grounded SAM diff --git a/docs/source/en/model_doc/llava-next-video.md b/docs/source/en/model_doc/llava-next-video.md index 88e41efc29c87c..48e50f950621e8 100644 --- a/docs/source/en/model_doc/llava-next-video.md +++ b/docs/source/en/model_doc/llava-next-video.md @@ -43,6 +43,13 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre - We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating. + + +- Llava-Next uses a different number of patches for images and thus has to pad the inputs inside the modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if the model is in `eval()` mode, otherwise "right-padding". + + + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that. We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. Each content field has to be a list of dicts, as follows: diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index b9d06ff97ffa53..0c25ed32db5ab3 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -46,6 +46,13 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ - We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating. + + +- Llava-Next uses a different number of patches for images and thus has to pad the inputs inside the modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if the model is in `eval()` mode, otherwise "right-padding". + + + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history, passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint. We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image.
Each content field has to be a list of dicts, as follows: @@ -100,6 +107,17 @@ print(text_prompt) "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" ``` +[llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llava-next-8b-hf) requires the following format: + +```bash +"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +``` + +[llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf) and [llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf) require the following format: + +```bash +"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n" +``` ## Usage example diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index ac0e25e02c35f9..16815f2fc1f3cd 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. ## Overview -Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen-72B, Qwen-1.8B, Qwen-VL, Qwen-Audio, etc. +Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen2-0.5B, Qwen2-1.5B, Qwen2-7B, Qwen2-57B-A14B, Qwen2-72B, Qwen2-Audio, etc. ### Model Details @@ -27,16 +27,16 @@ Qwen2 is a language model series including decoder language models of different ## Usage tips -`Qwen2-7B-beta` and `Qwen2-7B-Chat-beta` can be found on the [Huggingface Hub](https://huggingface.co/Qwen) +`Qwen2-7B` and `Qwen2-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen) -In the following, we demonstrate how to use `Qwen2-7B-Chat-beta` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. +In the following, we demonstrate how to use `Qwen2-7B-Instruct` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. ```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer >>> device = "cuda" # the device to load the model onto ->>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-7B-Chat", device_map="auto") ->>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat") +>>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct", device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct") >>> prompt = "Give me a short introduction to large language model." diff --git a/docs/source/en/model_doc/roberta.md b/docs/source/en/model_doc/roberta.md index 364b5b37e5f3f0..2a1843d8885abe 100644 --- a/docs/source/en/model_doc/roberta.md +++ b/docs/source/en/model_doc/roberta.md @@ -51,19 +51,19 @@ This model was contributed by [julien-c](https://huggingface.co/julien-c). The o ## Usage tips -- This implementation is the same as [`BertModel`] with a tiny embeddings tweak as well as a setup - for Roberta pretrained models. 
-- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a +- This implementation is the same as [`BertModel`] with a minor tweak to the embeddings, as well as a setup + for RoBERTa pretrained models. +- RoBERTa has the same architecture as BERT but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a different pretraining scheme. -- RoBERTa doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just - separate your segments with the separation token `tokenizer.sep_token` (or ``) -- Same as BERT with better pretraining tricks: - - * dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all - * together to reach 512 tokens (so the sentences are in an order than may span several documents) - * train with larger batches - * use BPE with bytes as a subunit and not characters (because of unicode characters) -- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to this page for usage examples. +- RoBERTa doesn't have `token_type_ids`, so you don't need to indicate which token belongs to which segment. Just + separate your segments with the separation token `tokenizer.sep_token` (or ``). +- RoBERTa is similar to BERT but with better pretraining techniques: + + * Dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all. + * Sentence packing: Sentences are packed together to reach 512 tokens (so the sentences are in an order that may span several documents). + * Larger batches: Training uses larger batches. + * Byte-level BPE vocabulary: Uses BPE with bytes as a subunit instead of characters, accommodating Unicode characters. +- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to its model page for usage examples. ## Resources diff --git a/docs/source/en/perf_torch_compile.md b/docs/source/en/perf_torch_compile.md index fb8027a9d5a188..acc424930b1c4e 100644 --- a/docs/source/en/perf_torch_compile.md +++ b/docs/source/en/perf_torch_compile.md @@ -98,7 +98,7 @@ Below you can find the list of the models we benchmarked. - [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) - [microsoft/beit-base-patch16-224-pt22k-ft22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k) - [facebook/convnext-large-224](https://huggingface.co/facebook/convnext-large-224) -- [microsoft/resnet-50](https://huggingface.co/) +- [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) **Image Segmentation** - [nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512) diff --git a/docs/source/en/quantization/fbgemm_fp8.md b/docs/source/en/quantization/fbgemm_fp8.md new file mode 100644 index 00000000000000..4df194d31be7ca --- /dev/null +++ b/docs/source/en/quantization/fbgemm_fp8.md @@ -0,0 +1,58 @@ + + +# FBGEMM FP8 + +With FBGEMM FP8 quantization method, you can quantize your model in FP8 (W8A8): +- the weights will be quantized in 8bit (FP8) per channel +- the activation will be quantized in 8bit (FP8) per token + +It relies on the [FBGEMM](https://github.com/pytorch/FBGEMM) library which provides efficient low-precision general matrix multiplication for small batch sizes and support for accuracy-loss minimizing techniques such as row-wise quantization and outlier-aware quantization. + +> [!TIP] +> You need a GPU with compute capability>=9 (e.g. 
H100) + +Before you begin, make sure the following libraries are installed with their latest version: + +```bash +pip install --upgrade accelerate fbgemm-gpu torch +``` + +If you are having issues with the fbgemm-gpu and torch libraries, you might need to install the nightly release. You can follow the instructions [here](https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries:~:text=found%20here.-,Install%20the%20FBGEMM_GPU%20Package,-Install%20through%20PyTorch). + + +```py +from transformers import FbgemmFp8Config, AutoModelForCausalLM, AutoTokenizer + +model_name = "meta-llama/Meta-Llama-3-8B" +quantization_config = FbgemmFp8Config() +quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quantization_config) + +tokenizer = AutoTokenizer.from_pretrained(model_name) +input_text = "What are we having for dinner?" +input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + +output = quantized_model.generate(**input_ids, max_new_tokens=10) +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + +A quantized model can be saved via `save_pretrained` and reloaded via `from_pretrained`. + +```py +quant_path = "/path/to/save/quantized/model" +model.save_pretrained(quant_path) +model = AutoModelForCausalLM.from_pretrained(quant_path, device_map="auto") +``` \ No newline at end of file diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 6cd13fc894633b..99fc669e49f448 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -55,4 +55,5 @@ Use the table below to help you decide which quantization method to use. | [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | | [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | | [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto | +| [FBGEMM_FP8](./fbgemm_fp8) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | diff --git a/docs/source/en/tf_xla.md b/docs/source/en/tf_xla.md index 86ed1035fccc9e..a585aec068b1f3 100644 --- a/docs/source/en/tf_xla.md +++ b/docs/source/en/tf_xla.md @@ -157,7 +157,7 @@ Execution time -- 79.0 ms Execution time -- 78.9 ms ``` -The first call to `xla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. Keep in mind that any change in the generation options at any point with trigger re-tracing and thus leading to slow-downs in the generation time. +The first call to `xla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. Keep in mind that any change in the generation options at any point will trigger re-tracing and thus lead to slow-downs in the generation time. We didn’t cover all the text generation options 🤗 Transformers provides in this document. We encourage you to read the documentation for advanced use cases.
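As a rough sketch of the re-tracing behavior described above (the GPT-2 checkpoint, padding length, and generation option values are arbitrary choices), only the first call with a given set of input shapes and options pays the tracing cost; changing an option forces a new trace:

```py
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
xla_generate = tf.function(model.generate, jit_compile=True)

# Pad to a fixed length so repeated calls share the same input shape
inputs = tokenizer(["TensorFlow is"], padding="max_length", max_length=32, return_tensors="tf")

_ = xla_generate(**inputs, max_new_tokens=8)   # slow: traces and compiles
_ = xla_generate(**inputs, max_new_tokens=8)   # fast: reuses the compiled graph
_ = xla_generate(**inputs, max_new_tokens=12)  # slow again: a changed generation option triggers re-tracing
```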
@@ -171,4 +171,4 @@ Here, we leave you with some additional resources if you want to delve deeper in * Recommended posts for learning more about XLA and TensorFlow graphs in general: * [XLA: Optimizing Compiler for Machine Learning](https://www.tensorflow.org/xla) * [Introduction to graphs and tf.function](https://www.tensorflow.org/guide/intro_to_graphs) - * [Better performance with tf.function](https://www.tensorflow.org/guide/function) \ No newline at end of file + * [Better performance with tf.function](https://www.tensorflow.org/guide/function) diff --git a/docs/source/es/chat_templating.md b/docs/source/es/chat_templating.md index 10129e87ef1184..e287c213743542 100644 --- a/docs/source/es/chat_templating.md +++ b/docs/source/es/chat_templating.md @@ -220,7 +220,7 @@ La plantilla de chat para un modelo se almacena en el atributo `tokenizer.chat_t >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` @@ -307,12 +307,6 @@ Si estás ajustando finamente un modelo para chat, además de establecer una pla -### ¿Qué son las plantillas "default"? - -Antes de la introducción de las plantillas de chat, el manejo del chat estaba codificado en el nivel de la clase del modelo. Por razones de compatibilidad con versiones anteriores, hemos conservado este manejo específico de la clase como plantillas predeterminadas, también establecidas a nivel de clase. Si un modelo no tiene una plantilla de chat establecida, pero hay una plantilla predeterminada para su clase de modelo, la clase `TextGenerationPipeline` y métodos como `apply_chat_template` usarán la plantilla de clase en su lugar. Puedes averiguar cuál es la plantilla predeterminada para tu tokenizador comprobando el atributo `tokenizer.default_chat_template`. - -Esto es algo que hacemos puramente por razones de compatibilidad con versiones anteriores, para evitar romper cualquier flujo de trabajo existente. Incluso cuando la plantilla de clase es apropiada para tu modelo, recomendamos encarecidamente anular la plantilla predeterminada estableciendo explícitamente el atributo `chat_template` para dejar claro a los usuarios que tu modelo ha sido configurado correctamente para el chat, y para estar preparados para el futuro en caso de que las plantillas predeterminadas alguna vez se alteren o se eliminen. - ### ¿Qué plantilla debería usar? Cuando establezcas la plantilla para un modelo que ya ha sido entrenado para chat, debes asegurarte de que la plantilla coincida exactamente con el formato de mensajes que el modelo vio durante el entrenamiento, o de lo contrario es probable que experimentes degradación del rendimiento. Esto es cierto incluso si estás entrenando aún más el modelo; probablemente obtendrás el mejor rendimiento si mantienes constantes los tokens de chat. Esto es muy análogo a la tokenización: generalmente obtienes el mejor rendimiento para la inferencia o el ajuste fino cuando coincides precisamente con la tokenización utilizada durante el entrenamiento. 
diff --git a/docs/source/ja/chat_templating.md b/docs/source/ja/chat_templating.md index 200bf40ac4cf40..82db942ef1e15b 100644 --- a/docs/source/ja/chat_templating.md +++ b/docs/source/ja/chat_templating.md @@ -85,7 +85,7 @@ LLM(Language Model)のますます一般的な使用事例の1つは「チ >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml index 517033cad562a2..fe966bdbfcf943 100644 --- a/docs/source/zh/_toctree.yml +++ b/docs/source/zh/_toctree.yml @@ -78,6 +78,8 @@ title: 如何将流水线添加到 🤗 Transformers? title: 贡献 - sections: + - local: philosophy + title: Transformers的设计理念 - local: task_summary title: 🤗Transformers能做什么 - local: tokenizer_summary diff --git a/docs/source/zh/chat_templating.md b/docs/source/zh/chat_templating.md index a08da47cb27a26..e0ab50b634c780 100644 --- a/docs/source/zh/chat_templating.md +++ b/docs/source/zh/chat_templating.md @@ -228,7 +228,7 @@ The sun. >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` diff --git a/docs/source/zh/philosophy.md b/docs/source/zh/philosophy.md new file mode 100644 index 00000000000000..b0fd0a5167d448 --- /dev/null +++ b/docs/source/zh/philosophy.md @@ -0,0 +1,67 @@ + + + + +# Transformers 的设计理念 + +🤗 Transformers 是一个专为以下用户群体构建的库: + +- 寻求使用、研究或扩展大规模 Transformers 模型的机器学习研究人员和教育者。 +- 希望微调这些模型或在生产环境中使用它们(或两者兼而有之)的实际操作者。 +- 只想下载预训练模型并将其用于解决给定机器学习任务的工程师。 + +Transformers 设计时有两个主要目标: + +1. 尽可能简单快速地使用: + + - 我们尽可能地限制用户能接触的抽象层,实际上几乎没有抽象。用户只需学习三个标准类即可使用每个模型:[configuration](main_classes/configuration)、[models](main_classes/model) 和一个预处理类(用于 NLP 的 [tokenizer](main_classes/tokenizer),用于视觉的 [image processor](main_classes/image_processor),用于音频的 [feature extractor](main_classes/feature_extractor),以及用于多模态输入的 [processor](main_classes/processors))。 + - 所有这些类都可以通过一个通用的 `from_pretrained()` 方法从预训练实例中简单统一地初始化,该方法会从提供在 [Hugging Face Hub](https://huggingface.co/models) 上的预训练检查点(如果需要的话)下载、缓存和加载相关类实例及相关数据(配置的超参数、分词器的词汇表和模型的权重)。 + - 在这三个基本类之上,该库提供了两种 API:[`pipeline`] 用于快速在给定任务上使用模型进行推断,以及 [`Trainer`] 用于快速训练或微调 PyTorch 模型(所有 TensorFlow 模型与 `Keras.fit` 兼容)。 + - 因此,Transformers 不是神经网络的模块化工具箱。如果要基于 Transformers 扩展或搭建新项目,请使用常规的 Python、PyTorch、TensorFlow、Keras 模块,并从 Transformers 的基类继承以重用模型加载和保存等功能。如果想了解更多有关我们的模型代码的设计理念,请查看我们的[重复自己](https://huggingface.co/blog/transformers-design-philosophy)博文。 + +2. 
提供与原始模型性能尽可能接近的最新模型: + + - 我们为每种架构提供至少一个示例,复现了该架构官方作者提供的结果。 + - 代码通常尽可能接近原始代码库,这意味着某些 PyTorch 代码可能不够*pytorchic*,因为它是转换后的 TensorFlow 代码,反之亦然。 + +其他几个目标: + +- 尽可能一致地公开模型的内部: + + - 我们使用单一 API 提供对完整隐藏状态和注意力权重的访问。 + - 预处理类和基本模型 API 标准化,便于在不同模型之间轻松切换。 + +- 结合主观选择的有前途的工具进行模型微调和调查: + + - 简单一致的方法来向词汇表和嵌入中添加新标记以进行微调。 + - 简单的方法来屏蔽和修剪 Transformer 头部。 + +- 轻松在 PyTorch、TensorFlow 2.0 和 Flax 之间切换,允许使用一个框架进行训练并使用另一个进行推断。 + +## 主要概念 + +该库围绕每个模型的三类类构建: + +- **模型类** 可以是 PyTorch 模型([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module))、Keras 模型([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model))或 JAX/Flax 模型([flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html)),这些模型可以使用库中提供的预训练权重。 +- **配置类** 存储构建模型所需的超参数(如层数和隐藏大小)。通常情况下,如果您使用不进行任何修改的预训练模型,则创建模型将自动处理配置的实例化(配置是模型的一部分)。 +- **预处理类** 将原始数据转换为模型可接受的格式。一个 [tokenizer](main_classes/tokenizer) 存储每个模型的词汇表,并提供编码和解码字符串为要馈送到模型的令牌嵌入索引列表的方法。[Image processors](main_classes/image_processor) 预处理视觉输入,[feature extractors](main_classes/feature_extractor) 预处理音频输入,而 [processor](main_classes/processors) 则处理多模态输入。 + +所有这些类都可以从预训练实例中实例化、本地保存,并通过以下三种方法与 Hub 共享: + +- `from_pretrained()` 允许您从库自身提供的预训练版本(支持的模型可在 [Model Hub](https://huggingface.co/models) 上找到)或用户本地(或服务器上)存储的版本实例化模型、配置和预处理类。 +- `save_pretrained()` 允许您本地保存模型、配置和预处理类,以便可以使用 `from_pretrained()` 重新加载。 +- `push_to_hub()` 允许您将模型、配置和预处理类共享到 Hub,以便所有人都可以轻松访问。 diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 1819b0235fafc9..cb708534116eee 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index 501aaf18642400..cea636c9a782f2 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 281881c51182ee..5bb8a245bbd139 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 6eb162adcb0784..88f4f2f635d042 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. 
Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 3c75f0b1504d19..ad9fde080373c1 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index c4936410c52ade..60f698951d0baa 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index b7557b903fdf06..3f8a2433c94f68 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index e67424f6819ca9..ee30bd4608638c 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 8af6e18b1ca37c..a1624d12c01e73 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index c2c3ff818b5b6b..379ad7eaecae35 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index e3efbec76c4419..fe1593fbdd849f 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 9a29e43d7d304c..e7d752a8b4513f 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index 8f57997deacbc7..e926b8b73ba5cd 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index cf80ae83ab2e0a..d6f04ed75ee4bc 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 7ef8d94f3e3cfb..577d86e4fc1579 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index 7154f1ffcd71e5..dd00375dfe849f 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 11c64c7c4849ef..6cb4a3b6bca5e7 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index f40015d9701dc1..ada6a4534b67df 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 75c3b1936fd024..8fe6697c497d94 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 33dc8baaa6e93f..3a9b3ffa16ac8d 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 51c13458ed44c0..7cc2a772d7eddf 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 8356493762ed7c..697ec34ac8860b 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 3f1eb681df225a..8250320dfbdad5 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index 296f045a5234e3..34d04d99d53c39 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 83eda3e98a75ea..53393ddc5d8ab4 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 4ba3564d6e505e..bfdf1fc70a1967 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index db6256aedbacfd..909084e65298a9 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 202cc7d661db87..6aed16d9fba88b 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 1932df9677ce6c..bac49185d55b43 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index a5929205b2f5fe..ad92a3c99e6be1 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 5ff906c22cba16..5c78da9119c078 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index b4019d64f774ce..ffcca59f3ddb3f 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 3da281430ec272..77bd768cc4f389 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 501b2df1c5eb6a..b18bfb15ea1793 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 225b20fcd63572..9bc80cc5f27e26 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 325cbebd9634b0..3f35c72113cf05 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index b4520d6af340eb..d37ebb7c4bbf3d 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index e3bb7ea94ca07f..751084a6f6e3dc 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 5ec90faf87a57d..152b8b02999db9 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 7fa0d3bc9005bf..18f5d37821c186 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 937d09d74d6277..65a34df2945d1f 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 61e14da6cb8b25..085dc7879d9b69 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index 7ccfc30802f5f0..f4194d4e30b834 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index e2e9f3d3d3daa0..7e7ce04f02413e 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/research_projects/bertabs/modeling_bertabs.py b/examples/research_projects/bertabs/modeling_bertabs.py index 66f2320ebd167c..c2c6a54be75ffa 100644 --- a/examples/research_projects/bertabs/modeling_bertabs.py +++ b/examples/research_projects/bertabs/modeling_bertabs.py @@ -557,7 +557,7 @@ def unshape(x): return context -class DecoderState(object): +class DecoderState: """Interface for grouping together the current state of a recurrent decoder. In the simplest case just represents the hidden state of the model. But can also be used for implementing various forms of @@ -694,7 +694,7 @@ def build_predictor(args, tokenizer, symbols, model, logger=None): return translator -class GNMTGlobalScorer(object): +class GNMTGlobalScorer: """ NMT re-ranking score from "Google's Neural Machine Translation System" :cite:`wu2016google` @@ -717,7 +717,7 @@ def score(self, beam, logprobs): return normalized_probs -class PenaltyBuilder(object): +class PenaltyBuilder: """ Returns the Length and Coverage Penalty function for Beam Search. @@ -763,7 +763,7 @@ def length_none(self, beam, logprobs, alpha=0.0, beta=0.0): return logprobs -class Translator(object): +class Translator: """ Uses a model to translate a batch of sentences. @@ -1002,7 +1002,7 @@ def tile(x, count, dim=0): # -class BertSumOptimizer(object): +class BertSumOptimizer: """Specific optimizer for BertSum. 
As described in [1], the authors fine-tune BertSum for abstractive diff --git a/examples/research_projects/distillation/grouped_batch_sampler.py b/examples/research_projects/distillation/grouped_batch_sampler.py index fd126b13b58ee7..e25def738a8483 100644 --- a/examples/research_projects/distillation/grouped_batch_sampler.py +++ b/examples/research_projects/distillation/grouped_batch_sampler.py @@ -59,7 +59,7 @@ class GroupedBatchSampler(BatchSampler): def __init__(self, sampler, group_ids, batch_size): if not isinstance(sampler, Sampler): - raise ValueError( + raise TypeError( "sampler should be an instance of torch.utils.data.Sampler, but got sampler={}".format(sampler) ) self.sampler = sampler diff --git a/examples/research_projects/fsner/src/fsner/tokenizer_utils.py b/examples/research_projects/fsner/src/fsner/tokenizer_utils.py index b281ae6cfb8961..7169e23dbe490d 100644 --- a/examples/research_projects/fsner/src/fsner/tokenizer_utils.py +++ b/examples/research_projects/fsner/src/fsner/tokenizer_utils.py @@ -3,7 +3,7 @@ from transformers import AutoTokenizer -class FSNERTokenizerUtils(object): +class FSNERTokenizerUtils: def __init__(self, pretrained_model_name_or_path): self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path) diff --git a/examples/research_projects/lxmert/modeling_frcnn.py b/examples/research_projects/lxmert/modeling_frcnn.py index 8aea9b5e1a4f9a..c7c3bf376ce382 100644 --- a/examples/research_projects/lxmert/modeling_frcnn.py +++ b/examples/research_projects/lxmert/modeling_frcnn.py @@ -417,7 +417,7 @@ def __new__(cls, *, channels=None, height=None, width=None, stride=None): return super().__new__(cls, channels, height, width, stride) -class Box2BoxTransform(object): +class Box2BoxTransform: """ This R-CNN transformation scales the box's width and height by exp(dw), exp(dh) and shifts a box's center by the offset @@ -519,7 +519,7 @@ def apply_deltas(self, deltas, boxes): return pred_boxes -class Matcher(object): +class Matcher: """ This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will have exactly zero or one matches; each @@ -622,7 +622,7 @@ def set_low_quality_matches_(self, match_labels, match_quality_matrix): match_labels[pred_inds_with_highest_quality] = 1 -class RPNOutputs(object): +class RPNOutputs: def __init__( self, box2box_transform, @@ -1132,7 +1132,7 @@ def forward(self, feature_maps, boxes): return output -class ROIOutputs(object): +class ROIOutputs: def __init__(self, cfg, training=False): self.smooth_l1_beta = cfg.ROI_BOX_HEAD.SMOOTH_L1_BETA self.box2box_transform = Box2BoxTransform(weights=cfg.ROI_BOX_HEAD.BBOX_REG_WEIGHTS) diff --git a/examples/research_projects/movement-pruning/emmental/modules/binarizer.py b/examples/research_projects/movement-pruning/emmental/modules/binarizer.py index b4a801d56d9de2..c96975e3b37509 100644 --- a/examples/research_projects/movement-pruning/emmental/modules/binarizer.py +++ b/examples/research_projects/movement-pruning/emmental/modules/binarizer.py @@ -108,7 +108,7 @@ def backward(ctx, gradOutput): return gradOutput, None -class MagnitudeBinarizer(object): +class MagnitudeBinarizer: """ Magnitude Binarizer. 
Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}` diff --git a/examples/research_projects/performer/modeling_flax_performer_utils.py b/examples/research_projects/performer/modeling_flax_performer_utils.py index 6e6173729cc348..24c5e4d7c7fcec 100644 --- a/examples/research_projects/performer/modeling_flax_performer_utils.py +++ b/examples/research_projects/performer/modeling_flax_performer_utils.py @@ -284,7 +284,7 @@ def kernel_feature_creator( return attention_fn -class RandomMatrix(object): +class RandomMatrix: r""" Abstract class providing a method for constructing 2D random arrays. Class is responsible for constructing 2D random arrays. @@ -348,7 +348,7 @@ def get_2d_array(self): return jnp.matmul(jnp.diag(multiplier), final_matrix) -class FastAttention(object): +class FastAttention: r""" Abstract class providing a method for fast attention. Class is responsible for providing a method for fast approximate attention. diff --git a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py index 454951ed3888a0..0ee4dd8afe1d5e 100644 --- a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py +++ b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py @@ -418,7 +418,7 @@ def test_finetune_lr_schedulers(self): with CaptureStdout() as cs: args = parser.parse_args(args) assert False, "--help is expected to sys.exit" - assert excinfo.type == SystemExit + assert excinfo.type is SystemExit expected = lightning_base.arg_to_scheduler_metavar assert expected in cs.out, "--help is expected to list the supported schedulers" @@ -429,7 +429,7 @@ def test_finetune_lr_schedulers(self): with CaptureStderr() as cs: args = parser.parse_args(args) assert False, "invalid argument is expected to sys.exit" - assert excinfo.type == SystemExit + assert excinfo.type is SystemExit expected = f"invalid choice: '{unsupported_param}'" assert expected in cs.err, f"should have bailed on invalid choice of scheduler {unsupported_param}" diff --git a/examples/research_projects/tapex/wikisql_utils.py b/examples/research_projects/tapex/wikisql_utils.py index 3351bddf019448..13d10e091a10c1 100644 --- a/examples/research_projects/tapex/wikisql_utils.py +++ b/examples/research_projects/tapex/wikisql_utils.py @@ -48,7 +48,7 @@ def convert_to_float(value): if isinstance(value, int): return float(value) if not isinstance(value, str): - raise ValueError("Argument value is not a string. Can't parse it as float") + raise TypeError("Argument value is not a string. 
Can't parse it as float") sanitized = value try: @@ -158,7 +158,7 @@ def _respect_conditions(table, row, conditions): cmp_value = _normalize_for_match(cmp_value) if not isinstance(table_value, type(cmp_value)): - raise ValueError("Type difference {} != {}".format(type(table_value), type(cmp_value))) + raise TypeError("Type difference {} != {}".format(type(table_value), type(cmp_value))) if not _compare(cond.operator, table_value, cmp_value): return False diff --git a/examples/research_projects/visual_bert/modeling_frcnn.py b/examples/research_projects/visual_bert/modeling_frcnn.py index 8aea9b5e1a4f9a..c7c3bf376ce382 100644 --- a/examples/research_projects/visual_bert/modeling_frcnn.py +++ b/examples/research_projects/visual_bert/modeling_frcnn.py @@ -417,7 +417,7 @@ def __new__(cls, *, channels=None, height=None, width=None, stride=None): return super().__new__(cls, channels, height, width, stride) -class Box2BoxTransform(object): +class Box2BoxTransform: """ This R-CNN transformation scales the box's width and height by exp(dw), exp(dh) and shifts a box's center by the offset @@ -519,7 +519,7 @@ def apply_deltas(self, deltas, boxes): return pred_boxes -class Matcher(object): +class Matcher: """ This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will have exactly zero or one matches; each @@ -622,7 +622,7 @@ def set_low_quality_matches_(self, match_labels, match_quality_matrix): match_labels[pred_inds_with_highest_quality] = 1 -class RPNOutputs(object): +class RPNOutputs: def __init__( self, box2box_transform, @@ -1132,7 +1132,7 @@ def forward(self, feature_maps, boxes): return output -class ROIOutputs(object): +class ROIOutputs: def __init__(self, cfg, training=False): self.smooth_l1_beta = cfg.ROI_BOX_HEAD.SMOOTH_L1_BETA self.box2box_transform = Box2BoxTransform(weights=cfg.ROI_BOX_HEAD.BBOX_REG_WEIGHTS) diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index ba83bbe56dcfb3..9cd6dc08bff5e4 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index a3ea7cf0b10d99..025ab7c448b8a3 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 7d8189b087efd8..d6fe9ab7cb5007 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index d9758c401826b7..1c3de1b137f315 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 3f5190186dc3f5..b5f6f444057a32 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index ec3fbe8c61bad3..c098192477a3bf 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 7280f7a95b37a5..6fe65b0cf3c669 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index f6a6875dcda691..5815d69e4da7cb 100644 --- a/setup.py +++ b/setup.py @@ -157,7 +157,7 @@ "rhoknp>=1.1.0,<1.3.1", "rjieba", "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff==0.4.4", + "ruff==0.5.1", "sacrebleu>=1.4.12,<2.0.0", "sacremoses", "safetensors>=0.4.1", @@ -430,7 +430,7 @@ def run(self): setup( name="transformers", - version="4.43.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.44.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 41347d401a5719..4c953bab6be4b0 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.43.0.dev0" +__version__ = "4.44.0.dev0" from typing import TYPE_CHECKING @@ -104,6 +104,7 @@ "DataCollatorForSOP", "DataCollatorForTokenClassification", "DataCollatorForWholeWordMask", + "DataCollatorWithFlattening", "DataCollatorWithPadding", "DefaultDataCollator", "default_data_collator", @@ -935,6 +936,7 @@ "AwqConfig", "BitsAndBytesConfig", "EetqConfig", + "FbgemmFp8Config", "GPTQConfig", "HqqConfig", "QuantoConfig", @@ -1295,6 +1297,7 @@ ) _import_structure["modeling_flash_attention_utils"] = [] _import_structure["modeling_outputs"] = [] + _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS"] _import_structure["modeling_utils"] = ["PreTrainedModel"] # PyTorch models structure @@ -4764,6 +4767,7 @@ DataCollatorForSOP, DataCollatorForTokenClassification, DataCollatorForWholeWordMask, + DataCollatorWithFlattening, DataCollatorWithPadding, DefaultDataCollator, default_data_collator, @@ -5667,6 +5671,7 @@ AwqConfig, BitsAndBytesConfig, EetqConfig, + FbgemmFp8Config, GPTQConfig, HqqConfig, QuantoConfig, @@ -6010,6 +6015,7 @@ WatermarkLogitsProcessor, WhisperTimeStampLogitsProcessor, ) + from .modeling_rope_utils import ROPE_INIT_FUNCTIONS from .modeling_utils import PreTrainedModel from .models.albert import ( AlbertForMaskedLM, diff --git a/src/transformers/agents/agent_types.py b/src/transformers/agents/agent_types.py index 0b4999b7f76d3c..114b6de01c3333 100644 --- a/src/transformers/agents/agent_types.py +++ b/src/transformers/agents/agent_types.py @@ -107,7 +107,7 @@ def __init__(self, value): elif isinstance(value, np.ndarray): self._tensor = torch.tensor(value) else: - raise ValueError(f"Unsupported type for {self.__class__.__name__}: {type(value)}") + raise TypeError(f"Unsupported type for {self.__class__.__name__}: {type(value)}") def _ipython_display_(self, include=None, exclude=None): """ diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index 34b457ce018956..9664ea49cb8f74 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -9,7 +9,7 @@ from 
packaging import version from .configuration_utils import PretrainedConfig -from .utils import is_hqq_available, is_quanto_available, logging +from .utils import is_hqq_available, is_quanto_available, is_torchdynamo_compiling, logging if is_quanto_available(): @@ -398,7 +398,6 @@ def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTens def crop(self, max_length: int): """Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be negative to remove `max_length` tokens. This is used in assisted decoding and contrastive search.""" - # In case it is negative if max_length < 0: max_length = self.get_seq_length() - abs(max_length) @@ -821,11 +820,13 @@ def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) for _ in range(config.num_hidden_layers): # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph - # breaks when updating the cache. + # breaks when updating the cache. It can't be used if the cache code is being compiled (but in that case + # it is not needed anyway) new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device) new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device) - torch._dynamo.mark_static_address(new_layer_key_cache) - torch._dynamo.mark_static_address(new_layer_value_cache) + if not is_torchdynamo_compiling(): + torch._dynamo.mark_static_address(new_layer_key_cache) + torch._dynamo.mark_static_address(new_layer_value_cache) self.key_cache.append(new_layer_key_cache) self.value_cache.append(new_layer_value_cache) @@ -862,8 +863,18 @@ def update( k_out.copy_(key_states) v_out.copy_(value_states) else: - k_out[:, :, cache_position] = key_states - v_out[:, :, cache_position] = value_states + # Note: here we use `tensor.index_copy_(dim, index, tensor)` that is equivalent to + # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place + # operation, that avoids copies and uses less memory. + try: + # If using several devices (e.g.: multiple GPUs), we need to ensure everything is on the same one + cache_position.to(device=k_out.device) + k_out.index_copy_(2, cache_position, key_states) + v_out.index_copy_(2, cache_position, value_states) + except NotImplementedError: + # The operator 'aten::index_copy.out' is not currently implemented for the MPS device. + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states return k_out, v_out @@ -958,8 +969,14 @@ def update( k_out = k_out[:, :, indices] v_out = v_out[:, :, indices] - k_out[:, :, cache_position] = key_states - v_out[:, :, cache_position] = value_states + try: + cache_position.to(device=k_out.device) + k_out.index_copy_(2, cache_position, key_states) + v_out.index_copy_(2, cache_position, value_states) + except NotImplementedError: + # The operator 'aten::index_copy.out' is not currently implemented for the MPS device. 
+ k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states # `_.zero()` followed by `+=` is equivalent `=`, but compile-friendly (without graph breaks due to assignment) self.key_cache[layer_idx].zero_() diff --git a/src/transformers/commands/pt_to_tf.py b/src/transformers/commands/pt_to_tf.py index 4df45f7f086a51..4002b5e0eb85a4 100644 --- a/src/transformers/commands/pt_to_tf.py +++ b/src/transformers/commands/pt_to_tf.py @@ -202,9 +202,7 @@ def get_inputs(self, pt_model, tf_dummy_inputs, config): """ def _get_audio_input(): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") speech_samples = ds.sort("id").select(range(2))[:2]["audio"] raw_samples = [x["array"] for x in speech_samples] return raw_samples diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index c6de824339bbc0..2f84bc29aee25d 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -1004,7 +1004,7 @@ def update_from_string(self, update_str: str): elif isinstance(old_v, float): v = float(v) elif not isinstance(old_v, str): - raise ValueError( + raise TypeError( f"You can only update int, float, bool or string values in the config, got {v} for key {k}" ) diff --git a/src/transformers/data/__init__.py b/src/transformers/data/__init__.py index 1a8ef35ff439e4..8b675aae281f32 100644 --- a/src/transformers/data/__init__.py +++ b/src/transformers/data/__init__.py @@ -19,6 +19,7 @@ DataCollatorForSOP, DataCollatorForTokenClassification, DataCollatorForWholeWordMask, + DataCollatorWithFlattening, DataCollatorWithPadding, DefaultDataCollator, default_data_collator, diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index ce17f79ccfc88e..20a21318786c4a 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -1611,3 +1611,38 @@ def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: ) & masked_indices[i] return inputs.astype(np.int64), perm_mask, target_mapping, labels.astype(np.int64) + + +@dataclass +class DataCollatorWithFlattening(DefaultDataCollator): + """ + Data collator used for padding free approach. Does the following: + + - concatate the entire mini batch into single long sequence [1, total_tokens] + - no padding will be added, returns `input_ids`, `labels` and `position_ids` + """ + + def __init__(self, *args, return_position_ids=True, **kwargs): + super().__init__(*args, **kwargs) + self.return_position_ids = return_position_ids + warnings.warn( + "Using `DataCollatorWithFlattening` will flatten the entire mini batch into single long sequence." + "Make sure your attention computation is able to handle it!" 
+ ) + + def __call__(self, features, return_tensors=None): + if return_tensors is None: + return_tensors = self.return_tensors + is_labels_provided = "labels" in features[0] + ret = {"input_ids": [], "labels": []} + if self.return_position_ids: + ret.update({"position_ids": []}) + for idx in range(0, len(features)): + ret["input_ids"] += features[idx]["input_ids"] + if is_labels_provided: + ret["labels"] += [-100] + features[idx]["labels"][1:] + else: + ret["labels"] += [-100] + features[idx]["input_ids"][1:] + if self.return_position_ids: + ret["position_ids"] += list(range(len(features[idx]["input_ids"]))) + return default_data_collator([ret], return_tensors) diff --git a/src/transformers/data/processors/xnli.py b/src/transformers/data/processors/xnli.py index 459c5bc3a6a38e..4d8ec17a8345db 100644 --- a/src/transformers/data/processors/xnli.py +++ b/src/transformers/data/processors/xnli.py @@ -47,11 +47,11 @@ def get_train_examples(self, data_dir): text_b = line[1] label = "contradiction" if line[2] == "contradictory" else line[2] if not isinstance(text_a, str): - raise ValueError(f"Training input {text_a} is not a string") + raise TypeError(f"Training input {text_a} is not a string") if not isinstance(text_b, str): - raise ValueError(f"Training input {text_b} is not a string") + raise TypeError(f"Training input {text_b} is not a string") if not isinstance(label, str): - raise ValueError(f"Training label {label} is not a string") + raise TypeError(f"Training label {label} is not a string") examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -70,11 +70,11 @@ def get_test_examples(self, data_dir): text_b = line[7] label = line[1] if not isinstance(text_a, str): - raise ValueError(f"Training input {text_a} is not a string") + raise TypeError(f"Training input {text_a} is not a string") if not isinstance(text_b, str): - raise ValueError(f"Training input {text_b} is not a string") + raise TypeError(f"Training input {text_b} is not a string") if not isinstance(label, str): - raise ValueError(f"Training label {label} is not a string") + raise TypeError(f"Training label {label} is not a string") examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index fcbb8469b9e5fe..7644d8d68d1696 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -63,7 +63,7 @@ "rhoknp": "rhoknp>=1.1.0,<1.3.1", "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff": "ruff==0.4.4", + "ruff": "ruff==0.5.1", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses", "safetensors": "safetensors>=0.4.1", diff --git a/src/transformers/generation/beam_constraints.py b/src/transformers/generation/beam_constraints.py index b53c4512427a87..e6462f322c49f7 100644 --- a/src/transformers/generation/beam_constraints.py +++ b/src/transformers/generation/beam_constraints.py @@ -156,7 +156,7 @@ def advance(self): def does_advance(self, token_id: int): if not isinstance(token_id, int): - raise ValueError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}") + raise TypeError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}") if self.completed: return False @@ -165,7 +165,7 @@ def does_advance(self, token_id: int): def update(self, token_id: int): if not isinstance(token_id, int): - raise 
ValueError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}") + raise TypeError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}") stepped = False completed = False @@ -300,7 +300,7 @@ def advance(self): def does_advance(self, token_id: int): if not isinstance(token_id, int): - raise ValueError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}") + raise TypeError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}") next_tokens = self.trie.next_tokens(self.current_seq) @@ -308,7 +308,7 @@ def does_advance(self, token_id: int): def update(self, token_id: int): if not isinstance(token_id, int): - raise ValueError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}") + raise TypeError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}") stepped = False completed = False @@ -432,7 +432,7 @@ def reset(self, token_ids: Optional[List[int]]): def add(self, token_id: int): if not isinstance(token_id, int): - raise ValueError(f"`token_id` should be an `int`, but is `{token_id}`.") + raise TypeError(f"`token_id` should be an `int`, but is `{token_id}`.") complete, stepped = False, False diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index e735d0a2ca7f5a..6fe71a446558e3 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -108,6 +108,9 @@ def __init__( self.assistant_model = assistant_model self.num_assistant_tokens = assistant_model.generation_config.num_assistant_tokens + # Set eos in assistant same as in target model + self.assistant_model.generation_config.eos_token_id = generation_config.eos_token_id + # Prepare the kwargs for the assistant model assistant_kwargs = {} for key, value in model_kwargs.items(): # deepcopy crashes if we attempt to copy encoder outputs with grads @@ -162,7 +165,7 @@ def __init__( self.generation_config.min_length = 0 self.generation_config.min_new_tokens = None for processor in self.logits_processor: - if type(processor) == MinLengthLogitsProcessor: + if isinstance(processor, MinLengthLogitsProcessor): raise ValueError( "Passing `MinLengthLogitsProcessor` when using `assisted_generation is disabled. 
" "Please pass in `min_length` into `.generate()` instead" @@ -267,6 +270,7 @@ class PromptLookupCandidateGenerator(CandidateGenerator): def __init__( self, + eos_token_id: torch.Tensor = None, num_output_tokens: int = 10, max_matching_ngram_size: int = None, max_length: int = 20, @@ -274,6 +278,7 @@ def __init__( self.num_output_tokens = num_output_tokens self.max_matching_ngram_size = max_matching_ngram_size if max_matching_ngram_size else 2 self.max_length = max_length + self.eos_token_id = eos_token_id if self.max_matching_ngram_size <= 0 or self.num_output_tokens <= 0: raise ValueError("Invalid max_matching_ngram_size or num_output_tokens") @@ -319,6 +324,15 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, if start_idx < end_idx: chosen_ids = input_ids[0, start_idx:end_idx] match_found = True + + # remove remaining candidate ids if an "eos" token is found, otherwise the target model may + # accept eos and the rest as valid, thus not stopping generation after "eos" + # NOTE: below code is written based on the fact that assisted decoding supports only bs=1 + mask = torch.isin(chosen_ids, self.eos_token_id) + match_indices_eos = torch.nonzero(mask) + if match_indices_eos.numel() > 0: + first_eos_index = match_indices_eos[0].item() + chosen_ids = chosen_ids[:first_eos_index] break if match_found: break @@ -364,19 +378,7 @@ def _crop_past_key_values(model, past_key_values, max_length): ) ) past_key_values = tuple(new_past) - # bloom is special - elif "bloom" in model.__class__.__name__.lower() or ( - model.config.architectures is not None and "bloom" in model.config.architectures[0].lower() - ): - for idx in range(len(past_key_values)): - new_past.append( - ( - past_key_values[idx][0][:, :, :max_length], - past_key_values[idx][1][:, :max_length, :], - ) - ) - past_key_values = tuple(new_past) - # gptbigcode is too + # gptbigcode is special and stores kv in shape (batch_size, seq_len, dim), if it's a multi_query model elif "gptbigcode" in model.__class__.__name__.lower() or ( model.config.architectures is not None and "gptbigcode" in model.config.architectures[0].lower() ): @@ -388,7 +390,6 @@ def _crop_past_key_values(model, past_key_values, max_length): past_key_values[idx] = past_key_values[idx][:, :, :max_length, :] elif isinstance(past_key_values, DynamicCache): past_key_values.crop(max_length) - elif past_key_values is not None: for idx in range(len(past_key_values)): new_past.append( diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index c9a978f5ee8990..b226a059d106b1 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1760,7 +1760,7 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> # Whisper has `begin_suppress_tokens` set by default (= `[220, 50256]`). 
50256 is the EOS token, so this means @@ -1812,7 +1812,7 @@ class SuppressTokensLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> # Whisper has a long list of suppressed tokens. For instance, in this case, the token 1 is suppressed by default. @@ -1901,7 +1901,7 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[3]["audio"]["array"], return_tensors="pt") >>> input_features = inputs.input_features diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 51019da9a6b378..8918487696ebe9 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -639,7 +639,7 @@ def _expand_dict_for_generation(dict_to_expand): return input_ids, model_kwargs - def _extract_past_from_model_output(self, outputs: ModelOutput, standardize_cache_format: bool = False): + def _extract_past_from_model_output(self, outputs: ModelOutput): past_key_values = None cache_name = "past_key_values" if "past_key_values" in outputs: @@ -652,10 +652,6 @@ def _extract_past_from_model_output(self, outputs: ModelOutput, standardize_cach past_key_values = outputs.cache_params cache_name = "cache_params" - # Bloom fix: standardizes the cache format when requested - if standardize_cache_format and hasattr(self, "_convert_to_standard_cache"): - batch_size = outputs.logits.shape[0] - past_key_values = self._convert_to_standard_cache(past_key_values, batch_size=batch_size) return cache_name, past_key_values def _update_model_kwargs_for_generation( @@ -663,13 +659,10 @@ def _update_model_kwargs_for_generation( outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False, - standardize_cache_format: bool = False, num_new_tokens: int = 1, ) -> Dict[str, Any]: # update past_key_values keeping its naming used in model code - cache_name, cache = self._extract_past_from_model_output( - outputs, standardize_cache_format=standardize_cache_format - ) + cache_name, cache = self._extract_past_from_model_output(outputs) model_kwargs[cache_name] = cache if getattr(outputs, "state", None) is not None: model_kwargs["state"] = outputs.state @@ -725,6 +718,7 @@ def _get_candidate_generator( """ if generation_config.prompt_lookup_num_tokens is not None: candidate_generator = PromptLookupCandidateGenerator( + eos_token_id=generation_config._eos_token_tensor, num_output_tokens=generation_config.prompt_lookup_num_tokens, max_matching_ngram_size=generation_config.max_matching_ngram_size, max_length=generation_config.max_length, @@ -1150,7 +1144,7 @@ def _validate_model_class(self): Confirms that the model class is compatible with generation. If not, raises an exception that points to the right class to use. 
""" - if not self.can_generate(): + if not is_torchdynamo_compiling() and not self.can_generate(): generate_compatible_mappings = [ MODEL_FOR_CAUSAL_LM_MAPPING, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING, @@ -1253,6 +1247,10 @@ def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]): def _validate_generated_length(self, generation_config, input_ids_length, has_default_max_length): """Performs validation related to the resulting generated length""" + # Can't throw warnings/exceptions during compilation + if is_torchdynamo_compiling(): + return + # 1. Max length warnings related to poor parameterization if has_default_max_length and generation_config.max_new_tokens is None and generation_config.max_length == 20: # 20 is the default max_length of the generation config @@ -1382,20 +1380,12 @@ def _prepare_generation_config( self.generation_config = new_generation_config using_model_generation_config = True generation_config = self.generation_config + using_model_generation_config = True # `torch.compile` can't compile `copy.deepcopy`, arguments in `kwargs` that are part of `generation_config` - # will mutate the object with `.update`. As such, passing these arguments through `kwargs` is disabled. - if is_torchdynamo_compiling(): - model_kwargs = kwargs - generate_attributes_in_kwargs = [ - key for key, value in kwargs.items() if getattr(generation_config, key, None) != value - ] - if len(generate_attributes_in_kwargs) > 0: - raise ValueError( - "`torch.compile` exception: all generation configuration attributes must be passed within a " - f"`generation_config` instance passed to `generate` (found: {generate_attributes_in_kwargs})." - ) - else: + # will mutate the object with `.update`. As such, passing these arguments through `kwargs` is disabled -- an + # exception will be raised in `_validate_model_kwargs` + if not is_torchdynamo_compiling(): generation_config = copy.deepcopy(generation_config) model_kwargs = generation_config.update(**kwargs) # If `generation_config` is provided, let's fallback ALL special tokens to the default values for the model @@ -1408,30 +1398,40 @@ def _prepare_generation_config( generation_config.pad_token_id = self.generation_config.pad_token_id if generation_config.decoder_start_token_id is None: generation_config.decoder_start_token_id = self.generation_config.decoder_start_token_id + else: + model_kwargs = kwargs return generation_config, model_kwargs def _get_initial_cache_position(self, input_ids, model_kwargs): """Calculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past length""" + # `torch.compile`-friendly `torch.arange` from a shape -- the lines below are equivalent to `torch.arange` + if "inputs_embeds" in model_kwargs: + cache_position = torch.ones_like(model_kwargs["inputs_embeds"][0, :, 0], dtype=torch.int64).cumsum(0) - 1 + else: + cache_position = torch.ones_like(input_ids[0, :], dtype=torch.int64).cumsum(0) - 1 + past_length = 0 if model_kwargs.get("past_key_values") is not None: cache = model_kwargs["past_key_values"] + past_length = 0 if not isinstance(cache, Cache): past_length = cache[0][0].shape[2] elif hasattr(cache, "get_seq_length") and cache.get_seq_length() is not None: past_length = cache.get_seq_length() - if "inputs_embeds" in model_kwargs: - cur_len = model_kwargs["inputs_embeds"].shape[1] - else: - cur_len = input_ids.shape[-1] - model_kwargs["cache_position"] = torch.arange(past_length, cur_len, device=input_ids.device) + # TODO(joao): this is not torch.compile-friendly, find a work-around. 
If the cache is not empty, + # end-to-end compilation will yield bad results because `cache_position` will be incorrect. + if not is_torchdynamo_compiling(): + cache_position = cache_position[past_length:] + + model_kwargs["cache_position"] = cache_position return model_kwargs def _get_cache(self, cache_implementation: str, max_batch_size: int, max_cache_len: int, model_kwargs) -> Cache: """ Sets a cache for `generate`, that will persist across calls. A new cache will only be initialized a - new `generate` call requires a larger cache. + new `generate` call requires a larger cache or uses a different batch size. Returns the resulting cache object. """ @@ -1464,7 +1464,14 @@ def _get_cache(self, cache_implementation: str, max_batch_size: int, max_cache_l if hasattr(self.config, "_pre_quantization_dtype"): cache_dtype = self.config._pre_quantization_dtype else: - cache_dtype = self.dtype + if not is_torchdynamo_compiling(): + cache_dtype = self.dtype + else: + # NOTE: self.dtype is not compatible with torch.compile, as it calls `self.parameters()`. + # Workaround: trust the lm_head, whose attribute name is somewhat consistent across generative + # models. May cause trobles with non-text modalities. + cache_dtype = self.lm_head.weight.dtype + cache_kwargs = { "config": self.config, "max_batch_size": max_batch_size, @@ -1541,27 +1548,29 @@ def _tensor_or_none(token, device=None): pad_token_tensor = eos_token_tensor[0] logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{pad_token_tensor} for open-end generation.") - # we can't infer attn mask if pad token is set to be eos token in model's generation config - if eos_token_tensor is not None and pad_token_tensor in eos_token_tensor: - if kwargs_has_attention_mask is not None and not kwargs_has_attention_mask: - logger.warning_once( - "The attention mask is not set and cannot be inferred from input because pad token is same as eos token." - "As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` " - "to obtain reliable results." - ) - # Sanity checks/warnings if self.config.is_encoder_decoder and decoder_start_token_tensor is None: raise ValueError( "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation." ) - if eos_token_tensor is not None and ( - torch.is_floating_point(eos_token_tensor) or (eos_token_tensor < 0).any() - ): - logger.warning( - f"`eos_token_id` should consist of positive integers, but is {eos_token_tensor}. Your generation will not " - "stop until the maximum length is reached. Depending on other flags, it may even crash." - ) + if not is_torchdynamo_compiling(): # Checks that depend on tensor-dependent control flow + if ( + eos_token_tensor is not None + and torch.isin(elements=eos_token_tensor, test_elements=pad_token_tensor).any() + ): + if kwargs_has_attention_mask is not None and not kwargs_has_attention_mask: + logger.warning_once( + "The attention mask is not set and cannot be inferred from input because pad token is same as " + "eos token. As a consequence, you may observe unexpected behavior. Please pass your input's " + "`attention_mask` to obtain reliable results." + ) + if eos_token_tensor is not None and ( + torch.is_floating_point(eos_token_tensor) or (eos_token_tensor < 0).any() + ): + logger.warning( + f"`eos_token_id` should consist of positive integers, but is {eos_token_tensor}. Your generation " + "will not stop until the maximum length is reached. Depending on other flags, it may even crash." 
+ ) # Update generation config with the updated special tokens tensors # NOTE: this must be written into a different attribute name than the one holding the original special tokens @@ -1770,6 +1779,12 @@ def generate( cache_name = "cache_params" else: cache_name = "past_key_values" + if (model_kwargs.get(cache_name) is not None) and is_torchdynamo_compiling(): + raise ValueError( + "Passing `past_key_values` is not supported when compiling `model.generate` with torch.compile -- you " + "may get incorrect outputs. Please compile `model.forward` only or use the `cache_implementation` " + "input argument." + ) if generation_config.cache_implementation is not None and (model_kwargs.get(cache_name) is not None): raise ValueError( f"Passing both `cache_implementation` (used to initialize certain caches) and `{cache_name}` (a " @@ -1846,7 +1861,7 @@ def generate( "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1." ) - if self.device.type != input_ids.device.type: + if not is_torchdynamo_compiling() and self.device.type != input_ids.device.type: warnings.warn( "You are calling .generate() with the `input_ids` being on a device type different" f" than your model's device. `input_ids` is on {input_ids.device.type}, whereas the model" @@ -2143,23 +2158,36 @@ def typeerror(): result.past_key_values = result.past_key_values.to_legacy_cache() return result - def _has_unfinished_sequences(self, this_peer_finished: bool, synced_gpus: bool, device: torch.device) -> bool: + def _has_unfinished_sequences( + self, + this_peer_finished: bool, + synced_gpus: bool, + device: torch.device, + cur_len: Optional[int] = None, + max_length: Optional[int] = None, + ) -> bool: """ Returns whether there are still unfinished sequences in the device. The existence of unfinished sequences is fed through `this_peer_finished`. ZeRO stage 3-friendly. """ - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: + # torch.compile does not support data-dependent control flow. This is a workaround to allow torch.compile, + # although we lose the ability to stop when all sequences return an EOS token (and other stopping criteria) + # TODO (joao): remove this when torch's support for control flow is not experimental (https://pytorch.org/docs/stable/generated/torch.cond.html) + if is_torchdynamo_compiling(): + return cur_len < max_length + else: + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + return False + elif this_peer_finished: return False - elif this_peer_finished: - return False - return True + return True def heal_tokens( self, input_ids: torch.LongTensor, tokenizer: Optional["PreTrainedTokenizerBase"] = None @@ -2557,7 +2585,6 @@ def _contrastive_search( outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder, - standardize_cache_format=True, ) if not sequential: @@ -2722,7 +2749,7 @@ def _contrastive_search( next_past_key_values = selected_outputs["past_key_values"] else: - _, next_past_key_values = self._extract_past_from_model_output(outputs, standardize_cache_format=True) + _, next_past_key_values = self._extract_past_from_model_output(outputs) # Do it in-place layer per layer to save memory if isinstance(next_past_key_values, DynamicCache) or ( isinstance(next_past_key_values, EncoderDecoderCache) @@ -2892,6 +2919,7 @@ def _sample( output_scores = generation_config.output_scores output_logits = generation_config.output_logits return_dict_in_generate = generation_config.return_dict_in_generate + max_length = generation_config.max_length has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria) do_sample = generation_config.do_sample if do_sample is True and not isinstance(logits_warper, LogitsProcessorList): @@ -2915,12 +2943,14 @@ def _sample( ) # keep track of which sequences are already finished - batch_size = input_ids.shape[0] + batch_size, cur_len = input_ids.shape this_peer_finished = False unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) - while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): + while self._has_unfinished_sequences( + this_peer_finished, synced_gpus, device=input_ids.device, cur_len=cur_len, max_length=max_length + ): # prepare model inputs model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) @@ -2966,6 +2996,7 @@ def _sample( # token selection if do_sample: probs = nn.functional.softmax(next_token_scores, dim=-1) + # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) else: next_tokens = torch.argmax(next_token_scores, dim=-1) @@ -2986,6 +3017,7 @@ def _sample( unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores) this_peer_finished = unfinished_sequences.max() == 0 + cur_len += 1 # This is needed to properly delete outputs.logits which may be very large for first iteration # Otherwise a reference to outputs is kept which keeps the logits alive in the next iteration @@ -3032,7 +3064,7 @@ def _temporary_reorder_cache(self, past_key_values, beam_idx): past_key_values = self._reorder_cache(past_key_values, beam_idx) # Exception 2: models with different cache formats. These are limited to `DynamicCache` until their # cache format is standardized, to avoid adding complexity to the codebase. - elif "bloom" in model_class or "gptbigcode" in model_class: + elif "gptbigcode" in model_class: if not isinstance(past_key_values, (DynamicCache, EncoderDecoderCache)): raise ValueError( f"Using an unsupported cache format with {model_class}. 
Currently, it only supports the " @@ -3160,7 +3192,6 @@ def _beam_search( for model_name in [ "fsmt", "reformer", - "bloom", "ctrl", "gpt_bigcode", "transo_xl", @@ -3954,7 +3985,6 @@ def _assisted_decoding( # 1. Fetch candidate sequences from a `CandidateGenerator` candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids) - candidate_input_ids = candidate_input_ids.to(self.device) if candidate_logits is not None: candidate_logits = candidate_logits.to(self.device) @@ -4281,7 +4311,7 @@ def _split(data, full_batch_size: int, split_size: int = None): for i in range(0, full_batch_size, split_size) ] else: - raise ValueError(f"Unexpected attribute type: {type(data)}") + raise TypeError(f"Unexpected attribute type: {type(data)}") def _split_model_inputs( @@ -4388,7 +4418,7 @@ def _concat(data): # If the elements are integers or floats, return a tensor return torch.tensor(data) else: - raise ValueError(f"Unexpected attribute type: {type(data[0])}") + raise TypeError(f"Unexpected attribute type: {type(data[0])}") # Use a dictionary comprehension to gather attributes from all objects and concatenate them concatenated_data = { diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index 045bf798050e93..4b5548fffb4154 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -164,7 +164,7 @@ def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): ) if type(None) not in field.type.__args__: # filter `str` in Union - field.type = field.type.__args__[0] if field.type.__args__[1] == str else field.type.__args__[1] + field.type = field.type.__args__[0] if field.type.__args__[1] is str else field.type.__args__[1] origin_type = getattr(field.type, "__origin__", field.type) elif bool not in field.type.__args__: # filter `NoneType` in Union (except for `Union[bool, NoneType]`) diff --git a/src/transformers/image_processing_base.py b/src/transformers/image_processing_base.py index 6c80aee0164722..9b314f83c11fb1 100644 --- a/src/transformers/image_processing_base.py +++ b/src/transformers/image_processing_base.py @@ -544,7 +544,7 @@ def fetch_images(self, image_url_or_urls: Union[str, List[str]]): response.raise_for_status() return Image.open(BytesIO(response.content)) else: - raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}") + raise TypeError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}") ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 4e4812879eed1c..580570f6066278 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -75,7 +75,7 @@ def to_channel_dimension_format( `np.ndarray`: The image with the channel dimension set to `channel_dim`. """ if not isinstance(image, np.ndarray): - raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}") if input_channel_dim is None: input_channel_dim = infer_channel_dimension_format(image) @@ -121,7 +121,7 @@ def rescale( `np.ndarray`: The rescaled image. 
""" if not isinstance(image, np.ndarray): - raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}") rescaled_image = image * scale if data_format is not None: @@ -453,7 +453,7 @@ def center_crop( return_numpy = True if return_numpy is None else return_numpy if not isinstance(image, np.ndarray): - raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}") if not isinstance(size, Iterable) or len(size) != 2: raise ValueError("size must have 2 elements representing the height and width of the output image") diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 13edcb3a2ad43b..4449b602491ad9 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -377,7 +377,7 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = elif isinstance(image, PIL.Image.Image): image = image else: - raise ValueError( + raise TypeError( "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image." ) image = PIL.ImageOps.exif_transpose(image) diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 9b838bd1608490..4c756a23ae0aa4 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -45,6 +45,7 @@ "unset_hf_deepspeed_config", ], "eetq": ["replace_with_eetq_linear"], + "fbgemm_fp8": ["FbgemmFp8Linear", "replace_with_fbgemm_fp8_linear"], "ggml": [ "GGUF_CONFIG_MAPPING", "GGUF_TENSOR_MAPPING", @@ -126,6 +127,7 @@ unset_hf_deepspeed_config, ) from .eetq import replace_with_eetq_linear + from .fbgemm_fp8 import FbgemmFp8Linear, replace_with_fbgemm_fp8_linear from .ggml import ( GGUF_CONFIG_MAPPING, GGUF_TENSOR_MAPPING, diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 30427aa405dd56..550c23fde3d4ad 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -199,7 +199,7 @@ def get_modules_to_fuse(model, quantization_config): The quantization configuration to use. """ if not isinstance(model, PreTrainedModel): - raise ValueError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}") + raise TypeError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}") # Always default to `quantization_config.modules_to_fuse` if quantization_config.modules_to_fuse is not None: diff --git a/src/transformers/integrations/fbgemm_fp8.py b/src/transformers/integrations/fbgemm_fp8.py new file mode 100644 index 00000000000000..a0f5b2b76089b9 --- /dev/null +++ b/src/transformers/integrations/fbgemm_fp8.py @@ -0,0 +1,161 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging + + +if is_torch_available(): + import torch + from torch import nn + +if is_accelerate_available(): + from accelerate import init_empty_weights + +if is_fbgemm_gpu_available(): + import fbgemm_gpu.experimental.gen_ai # noqa: F401 + +logger = logging.get_logger(__name__) + + +class FbgemmFp8Linear(torch.nn.Module): + def __init__(self, in_features, out_features, bias, weight_dtype=torch.float32): + super().__init__() + self.in_features = in_features + self.out_features = out_features + + self.register_buffer("weight", torch.zeros((out_features, in_features), dtype=torch.float8_e4m3fn)) + self.register_buffer("weight_scale", torch.zeros((out_features, 1), dtype=weight_dtype)) + self.register_buffer("input_scale_ub", torch.zeros([1], dtype=torch.float), persistent=False) + + if bias: + self.register_buffer("bias", torch.zeros((self.out_features), dtype=weight_dtype)) + else: + self.bias = None + + def forward(self, x): + num_tokens = None + # x_quantized and x_scale are not necessarily on the same device as x; this is an issue. + # https://github.com/pytorch/FBGEMM/blob/e08af8539c391437f447173863df0f3f6f6f1855/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu#L1237C3-L1237C45 + x_quantized, x_scale = torch.ops.fbgemm.quantize_fp8_per_row( + x.view(-1, x.shape[-1]), num_tokens, self.input_scale_ub + ) + # moving x_quantized, x_scale here creates gibberish output ... However, if we move the output, it works + # x_quantized, x_scale = x_quantized.to(x.device), x_scale.to(x.device) + + # The computation still happens on the device where self.weight is, even if x_quantized is not on the same device as self.weight + output = torch.ops.fbgemm.f8f8bf16_rowwise( + x_quantized, self.weight, x_scale, self.weight_scale, use_fast_accum=True + ) + output = output + self.bias if self.bias is not None else output + # Hacky for now: we move the output to the device of x + output = output.to(x.device) + del x_quantized, x_scale + return output + + +def _replace_with_fbgemm_fp8_linear( + model, + modules_to_not_convert=None, + current_key_name=None, + quantization_config=None, + has_been_replaced=False, + pre_quantized=False, +): + """ + Private method that wraps the recursion for module replacement. + + Returns the converted model and a boolean that indicates if the conversion has been successful or not. + """ + if current_key_name is None: + current_key_name = [] + + for name, module in model.named_children(): + current_key_name.append(name) + + if (isinstance(module, nn.Linear)) and name not in modules_to_not_convert: + # Check if the current key is not in the `modules_to_not_convert` + current_key_name_str = ".".join(current_key_name) + if not any( + (key + "." 
in current_key_name_str) or (key == current_key_name_str) for key in modules_to_not_convert + ): + with init_empty_weights(include_buffers=True): + in_features = module.in_features + out_features = module.out_features + model._modules[name] = FbgemmFp8Linear( + in_features, + out_features, + module.bias is not None, + ) + has_been_replaced = True + + # Force requires_grad to False to avoid unexpected errors + model._modules[name].requires_grad_(False) + # Set the non-persistent buffer outside of init_empty_weights + model._modules[name].input_scale_ub = torch.tensor( + [quantization_config.activation_scale_ub], dtype=torch.float + ) + if len(list(module.children())) > 0: + _, has_been_replaced = _replace_with_fbgemm_fp8_linear( + module, + modules_to_not_convert, + current_key_name, + quantization_config, + has_been_replaced=has_been_replaced, + pre_quantized=pre_quantized, + ) + # Remove the last key for recursion + current_key_name.pop(-1) + return model, has_been_replaced + + +def replace_with_fbgemm_fp8_linear( + model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, pre_quantized=False +): + """ + A helper function to replace all `torch.nn.Linear` modules with `FbgemmFp8Linear` modules. + This enables running your models with the high-performance FP8 kernels from the FBGEMM library. + + The function is run recursively and replaces all `torch.nn.Linear` modules except for `lm_head`, which should + be kept as a `torch.nn.Linear` module. The replacement is done under the `init_empty_weights` context manager, so no + CPU/GPU memory is required to run this function. Each weight is quantized per channel. + + Parameters: + model (`torch.nn.Module`): + Input model or `torch.nn.Module` as the function is run recursively. + modules_to_not_convert (`List[str]`, *optional*, defaults to `["lm_head"]`): + Names of the modules to not convert to `FbgemmFp8Linear`. In practice, we keep the `lm_head` in full precision + for numerical stability reasons. + current_key_name (`List[str]`, *optional*): + A list to track the current key of the recursion. This is used to check whether the current key (part of + it) is not in the list of modules to not convert (for instance, modules that are offloaded to `cpu` or + `disk`). + """ + + modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert + + if quantization_config.modules_to_not_convert is not None: + modules_to_not_convert.extend(quantization_config.modules_to_not_convert) + modules_to_not_convert = list(set(modules_to_not_convert)) + model, has_been_replaced = _replace_with_fbgemm_fp8_linear( + model, modules_to_not_convert, current_key_name, quantization_config, pre_quantized=pre_quantized + ) + + if not has_been_replaced: + logger.warning( + "You are loading your model using FP8 quantization but no linear modules were found in your model." + " Please double check your model architecture, or submit an issue on GitHub if you think this is" + " a bug." 
+ ) + + return model diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 5c2d72c345ecf9..47f3f0cf8d57b4 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -36,6 +36,7 @@ # Listed here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md GGML_TYPES = { "F32": 0, + "F16": 1, "Q4_0": 2, "Q8_0": 8, "Q2_K": 10, @@ -489,6 +490,8 @@ def dequantize_q5_k(data): def load_dequant_gguf_tensor(shape, ggml_type, data): if ggml_type == GGML_TYPES["F32"]: values = data + elif ggml_type == GGML_TYPES["F16"]: + values = data elif ggml_type == GGML_TYPES["Q8_0"]: values = dequantize_q8_0(data) elif ggml_type == GGML_TYPES["Q4_0"]: @@ -609,7 +612,7 @@ def tokenizer(self, proto): self.additional_kwargs["bos_token"] = eos_token if self.is_llama_3_tokenizer: - self.additional_kwargs["add_prefix_space"] = False + self.additional_kwargs["add_prefix_space"] = None self.additional_kwargs["clean_up_tokenization_spaces"] = True self.additional_kwargs["legacy"] = False diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index e9c91192ecf9a2..5bdb21d67323e3 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -26,6 +26,7 @@ import sys import tempfile from dataclasses import asdict, fields +from enum import Enum from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union @@ -252,10 +253,11 @@ def _objective(trial, checkpoint_dir=None): timeout = kwargs.pop("timeout", None) n_jobs = kwargs.pop("n_jobs", 1) + gc_after_trial = kwargs.pop("gc_after_trial", False) directions = direction if isinstance(direction, list) else None direction = None if directions is not None else direction study = optuna.create_study(direction=direction, directions=directions, **kwargs) - study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs) + study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs, gc_after_trial=gc_after_trial) if not study._is_multi_objective(): best_trial = study.best_trial return BestRun(str(best_trial.number), best_trial.value, best_trial.params) @@ -725,6 +727,35 @@ def print_to_file(s): print(model, file=f) +class WandbLogModel(str, Enum): + """Enum of possible log model values in W&B.""" + + CHECKPOINT = "checkpoint" + END = "end" + FALSE = "false" + + @property + def is_enabled(self) -> bool: + """Check if the value corresponds to a state where the `WANDB_LOG_MODEL` setting is enabled.""" + return self in (WandbLogModel.CHECKPOINT, WandbLogModel.END) + + @classmethod + def _missing_(cls, value: Any) -> "WandbLogModel": + if not isinstance(value, str): + raise ValueError(f"Expecting to have a string `WANDB_LOG_MODEL` setting, but got {type(value)}") + if value.upper() in ENV_VARS_TRUE_VALUES: + DeprecationWarning( + f"Setting `WANDB_LOG_MODEL` as {os.getenv('WANDB_LOG_MODEL')} is deprecated and will be removed in " + "version 5 of transformers. Use one of `'end'` or `'checkpoint'` instead." 
+ ) + logger.info(f"Setting `WANDB_LOG_MODEL` from {os.getenv('WANDB_LOG_MODEL')} to `end` instead") + return WandbLogModel.END + logger.warning( + f"Received unrecognized `WANDB_LOG_MODEL` setting value={value}; so disabling `WANDB_LOG_MODEL`" + ) + return WandbLogModel.FALSE + + class WandbCallback(TrainerCallback): """ A [`TrainerCallback`] that logs metrics, media, model checkpoints to [Weight and Biases](https://www.wandb.com/). @@ -739,16 +770,7 @@ def __init__(self): self._wandb = wandb self._initialized = False - # log model - if os.getenv("WANDB_LOG_MODEL", "FALSE").upper() in ENV_VARS_TRUE_VALUES.union({"TRUE"}): - DeprecationWarning( - f"Setting `WANDB_LOG_MODEL` as {os.getenv('WANDB_LOG_MODEL')} is deprecated and will be removed in " - "version 5 of transformers. Use one of `'end'` or `'checkpoint'` instead." - ) - logger.info(f"Setting `WANDB_LOG_MODEL` from {os.getenv('WANDB_LOG_MODEL')} to `end` instead") - self._log_model = "end" - else: - self._log_model = os.getenv("WANDB_LOG_MODEL", "false").lower() + self._log_model = WandbLogModel(os.getenv("WANDB_LOG_MODEL", "false")) def setup(self, args, state, model, **kwargs): """ @@ -833,37 +855,38 @@ def setup(self, args, state, model, **kwargs): logger.info("Could not log the number of model parameters in Weights & Biases.") # log the initial model architecture to an artifact - with tempfile.TemporaryDirectory() as temp_dir: - model_name = ( - f"model-{self._wandb.run.id}" - if (args.run_name is None or args.run_name == args.output_dir) - else f"model-{self._wandb.run.name}" - ) - model_artifact = self._wandb.Artifact( - name=model_name, - type="model", - metadata={ - "model_config": model.config.to_dict() if hasattr(model, "config") else None, - "num_parameters": self._wandb.config.get("model/num_parameters"), - "initial_model": True, - }, - ) - # add the architecture to a separate text file - save_model_architecture_to_file(model, temp_dir) - - for f in Path(temp_dir).glob("*"): - if f.is_file(): - with model_artifact.new_file(f.name, mode="wb") as fa: - fa.write(f.read_bytes()) - self._wandb.run.log_artifact(model_artifact, aliases=["base_model"]) - - badge_markdown = ( - f'[Visualize in Weights & Biases]({self._wandb.run.get_url()})' - ) + if self._log_model.is_enabled: + with tempfile.TemporaryDirectory() as temp_dir: + model_name = ( + f"model-{self._wandb.run.id}" + if (args.run_name is None or args.run_name == args.output_dir) + else f"model-{self._wandb.run.name}" + ) + model_artifact = self._wandb.Artifact( + name=model_name, + type="model", + metadata={ + "model_config": model.config.to_dict() if hasattr(model, "config") else None, + "num_parameters": self._wandb.config.get("model/num_parameters"), + "initial_model": True, + }, + ) + # add the architecture to a separate text file + save_model_architecture_to_file(model, temp_dir) + + for f in Path(temp_dir).glob("*"): + if f.is_file(): + with model_artifact.new_file(f.name, mode="wb") as fa: + fa.write(f.read_bytes()) + self._wandb.run.log_artifact(model_artifact, aliases=["base_model"]) + + badge_markdown = ( + f'[Visualize in Weights & Biases]({self._wandb.run.get_url()})' + ) - modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}" + modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}" def on_train_begin(self, args, state, control, model=None, **kwargs): if self._wandb is None: @@ -879,7 +902,7 @@ def on_train_begin(self, args, state, control, model=None, **kwargs): def on_train_end(self, args, state, control, model=None, tokenizer=None, 
**kwargs): if self._wandb is None: return - if self._log_model in ("end", "checkpoint") and self._initialized and state.is_world_process_zero: + if self._log_model.is_enabled and self._initialized and state.is_world_process_zero: from ..trainer import Trainer fake_trainer = Trainer(args=args, model=model, tokenizer=tokenizer) @@ -937,7 +960,7 @@ def on_log(self, args, state, control, model=None, logs=None, **kwargs): self._wandb.log({**non_scalar_logs, "train/global_step": state.global_step}) def on_save(self, args, state, control, **kwargs): - if self._log_model == "checkpoint" and self._initialized and state.is_world_process_zero: + if self._log_model == WandbLogModel.CHECKPOINT and self._initialized and state.is_world_process_zero: checkpoint_metadata = { k: v for k, v in dict(self._wandb.summary).items() diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index a543315410c785..923aa59e4184dc 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -262,9 +262,7 @@ def add_adapter(self, adapter_config, adapter_name: Optional[str] = None) -> Non raise ValueError(f"Adapter with name {adapter_name} already exists. Please use a different name.") if not isinstance(adapter_config, PeftConfig): - raise ValueError( - f"adapter_config should be an instance of PeftConfig. Got {type(adapter_config)} instead." - ) + raise TypeError(f"adapter_config should be an instance of PeftConfig. Got {type(adapter_config)} instead.") # Retrieve the name or path of the model, one could also use self.config._name_or_path # but to be consistent with what we do in PEFT: https://github.com/huggingface/peft/blob/6e783780ca9df3a623992cc4d1d665001232eae0/src/peft/mapping.py#L100 diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py index 1742e419b4aaea..7bb3ee03c07074 100644 --- a/src/transformers/modeling_flash_attention_utils.py +++ b/src/transformers/modeling_flash_attention_utils.py @@ -130,6 +130,56 @@ def _upad_input( ) + +def prepare_fa2_from_position_ids(query, key, value, position_ids): + """ + This function returns the necessary arguments to call `flash_attn_varlen_func`. + All three query, key, and value states will be flattened. + Cumulative lengths of each example in the batch will be extracted from position_ids. + + NOTE: ideally, cumulative lengths should be prepared at the data collator stage + + Arguments: + query (`torch.Tensor`): + Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim). + key (`torch.Tensor`): + Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim). + value (`torch.Tensor`): + Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim). + position_ids (`torch.Tensor`): + Int tensor of shape (batch_size, sequence_length) with the position of each token within its sequence; positions restart from 0 for every packed sequence, which is used to recover the sequence boundaries. + + Return: + query (`torch.Tensor`): + Query state without padding. Shape: (total_target_length, num_heads, head_dim). + key (`torch.Tensor`): + Key state without padding. Shape: (total_source_length, num_key_value_heads, head_dim). + value (`torch.Tensor`): + Value state without padding. Shape: (total_source_length, num_key_value_heads, head_dim). + indices_q (`torch.Tensor`): + The indices of non-masked tokens from the flattened input target sequence. 
+ (cu_seqlens_q, cu_seqlens_k) (`Tuple[torch.Tensor]`): + The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,). + (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`): + Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e., query, `max_seqlen_in_batch_k` for the source sequence i.e., key/value). + """ + query = query.view(-1, query.size(-2), query.size(-1)) + key = key.view(-1, key.size(-2), key.size(-1)) + value = value.view(-1, value.size(-2), value.size(-1)) + position_ids = position_ids.flatten() + indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32) + + cu_seq_lens = torch.cat( + ( + indices_q[position_ids == 0], + torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32), + ) + ) + + max_length = position_ids.max() + 1 + + return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length)) + + def _flash_attention_forward( query_states: torch.Tensor, key_states: torch.Tensor, @@ -138,11 +188,12 @@ def _flash_attention_forward( query_length: int, is_causal: bool, dropout: float = 0.0, + position_ids: Optional[torch.Tensor] = None, softmax_scale: Optional[float] = None, sliding_window: Optional[int] = None, use_top_left_mask: bool = False, softcap: Optional[float] = None, - deterministic: bool = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1", + deterministic: bool = None, ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token @@ -182,6 +233,8 @@ def _flash_attention_forward( flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {} if is_flash_attn_greater_or_equal("2.4.1"): + if deterministic is None: + deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" + flash_kwargs["deterministic"] = deterministic if softcap is not None: @@ -210,6 +263,36 @@ def _flash_attention_forward( **flash_kwargs, ) attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + + # If position_ids is provided, not all examples (rows) contain only one sequence, and we are in the pre-fill/training stage, + # then use `flash_attn_varlen_func` to prevent cross-example attention and also allow the padding-free approach + elif ( + position_ids is not None and not (position_ids[:, -1] == position_ids.size(1) - 1).all() and query_length != 1 + ): + batch_size = query_states.size(0) + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids( + query_states, key_states, value_states, position_ids + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + **flash_kwargs, + ) + + attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1)) + else: attn_output = flash_attn_func( query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal, **flash_kwargs diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index 61077cf7c30938..9d12e1e67c8082 100644 --- 
a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -90,7 +90,7 @@ def dtype_byte_size(dtype): 4 ``` """ - if dtype == bool: + if dtype is bool: return 1 / 8 bit_search = re.search(r"[^\d](\d+)$", dtype.name) if bit_search is None: diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py new file mode 100644 index 00000000000000..61315e18d41245 --- /dev/null +++ b/src/transformers/modeling_rope_utils.py @@ -0,0 +1,549 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Optional, Tuple + +from .configuration_utils import PretrainedConfig +from .utils import is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + + +def _compute_default_rope_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + base = rope_kwargs["base"] + dim = rope_kwargs["dim"] + elif config is not None: + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) + return inv_freq, attention_factor + + +def _compute_linear_scaling_rope_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. 
+ device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + factor = rope_kwargs["factor"] + elif config is not None: + factor = config.rope_scaling["factor"] + + # Gets the default RoPE parameters + inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) + + # Then applies linear scaling to the frequencies. + # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so + # applying scaling to the inverse frequencies is equivalent. + inv_freq /= factor + return inv_freq, attention_factor + + +def _compute_dynamic_ntk_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length, used to update the dynamic RoPE at inference time. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + base = rope_kwargs["base"] + dim = rope_kwargs["dim"] + max_position_embeddings = rope_kwargs["max_position_embeddings"] + factor = rope_kwargs["factor"] + elif config is not None: + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + max_position_embeddings = config.max_position_embeddings + factor = config.rope_scaling["factor"] + + attention_factor = 1.0 # Unused in this type of RoPE + + # seq_len: default to max_position_embeddings, e.g. 
at init time + seq_len = seq_len if seq_len is not None else max_position_embeddings + + # Compute the inverse frequencies + base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) + return inv_freq, attention_factor + + +def _compute_yarn_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with NTK scaling. Please refer to the + [original paper](https://arxiv.org/abs/2309.00071) + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # No need to keep BC with yarn, unreleased when this new pattern was created. + if len(rope_kwargs) > 0: + raise ValueError( + f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}" + ) + + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + max_position_embeddings = config.max_position_embeddings + factor = config.rope_scaling["factor"] + + # Sets the attention factor as suggested in the paper + attention_factor = config.rope_scaling.get("attention_factor") + if attention_factor is None: + attention_factor = 0.1 * math.log(factor) + 1.0 + + # Optional config options + # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly) + beta_fast = config.rope_scaling.get("beta_fast") or 32 + beta_slow = config.rope_scaling.get("beta_slow") or 1 + + # Compute the inverse frequencies + def find_correction_dim(num_rotations, dim, base, max_position_embeddings): + """Inverse dimension formula to find the dimension based on the number of rotations""" + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) + + def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings): + """Find dimension range bounds based on rotations""" + low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) + + def linear_ramp_mask(min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (factor * pos_freqs) + + low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings) + + # Get n-dimensional rotational scaling corrected for extrapolation + inv_freq_mask = 1 - linear_ramp_mask(low, high, dim // 2).float().to(device) 
+ inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + + return inv_freq, attention_factor + + +def _compute_longrope_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with LongRoPE scaling. Please refer to the + [original implementation](https://github.com/microsoft/LongRoPE) + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling + # No need to keep BC with longrope, unreleased when this new pattern was created. + if len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got " + f"{rope_kwargs}" + ) + + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + long_factor = config.rope_scaling["long_factor"] + short_factor = config.rope_scaling["short_factor"] + factor = config.rope_scaling.get("factor") + attention_factor = config.rope_scaling.get("attention_factor") + + # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a + # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two + # values to compute the default attention scaling factor, instead of using `factor`. + if hasattr(config, "original_max_position_embeddings"): + max_position_embeddings = config.original_max_position_embeddings + expanded_max_position_embeddings = config.max_position_embeddings + factor = expanded_max_position_embeddings / max_position_embeddings + else: + max_position_embeddings = config.max_position_embeddings + expanded_max_position_embeddings = max_position_embeddings * factor + + # Sets the attention factor as suggested in the paper + if attention_factor is None: + if factor <= 1.0: + attention_factor = 1.0 + else: + attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings)) + + # Compute the inverse frequencies -- scaled based on the target sequence length + if expanded_max_position_embeddings > max_position_embeddings: + ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device) + else: + ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device) + inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim + inv_freq = 1.0 / (ext_factors * base**inv_freq_shape) + + return inv_freq, attention_factor + + +def _compute_llama3_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies for llama 3.1. + + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. 
+ device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # Gets the default RoPE parameters + inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) + + factor = config.rope_scaling["factor"] # `8` in the original implementation + low_freq_factor = config.rope_scaling["low_freq_factor"] # `1` in the original implementation + high_freq_factor = config.rope_scaling["high_freq_factor"] # `4` in the original implementation + old_context_len = config.rope_scaling["original_max_position_embeddings"] # `8192` in the original implementation + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + wavelen = 2 * math.pi / inv_freq + # wavelen < high_freq_wavelen: do nothing + # wavelen > low_freq_wavelen: divide by factor + inv_freq_new = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq) + # otherwise: interpolate between the two, using a smooth factor + smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + smoothed_inv_freq = (1 - smooth_factor) * inv_freq_new / factor + smooth_factor * inv_freq_new + is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) + inv_freq_new = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_new) + + return inv_freq, attention_factor + + +# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters +# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE +# parameterizations, as long as the callable has the same signature. 
+ROPE_INIT_FUNCTIONS = { + "default": _compute_default_rope_parameters, + "linear": _compute_linear_scaling_rope_parameters, + "dynamic": _compute_dynamic_ntk_parameters, + "yarn": _compute_yarn_parameters, + "longrope": _compute_longrope_parameters, + "llama3": _compute_llama3_parameters, +} + + +def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None): + """Compare the received keys in `config.rope_scaling` against the expected and optional keys""" + # BC: "rope_type" was originally "type" -- let's gracefully handle it + if "rope_type" not in received_keys and "type" in received_keys: + received_keys -= {"type"} + received_keys.add("rope_type") + + missing_keys = required_keys - received_keys + if missing_keys: + raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}") + + if optional_keys is not None: + unused_keys = received_keys - required_keys - optional_keys + else: + unused_keys = received_keys - required_keys + if unused_keys: + logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}") + + +def _validate_default_rope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys) + + +def _validate_linear_scaling_rope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type", "factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + +def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type", "factor"} + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"original_max_position_embeddings"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + +def _validate_yarn_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type", "factor"} + optional_keys = {"attention_factor", "beta_fast", "beta_slow"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_scaling.get("attention_factor") + if attention_factor is not None 
and (not isinstance(attention_factor, float) or attention_factor < 0): + logger.warning( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + beta_fast = rope_scaling.get("beta_fast") + if beta_fast is not None and not isinstance(beta_fast, float): + logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") + beta_slow = rope_scaling.get("beta_slow") + if beta_slow is not None and not isinstance(beta_slow, float): + logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") + + if (beta_fast or 32) < (beta_slow or 1): + logger.warning( + f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " + f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" + ) + + +def _validate_longrope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type", "short_factor", "long_factor"} + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys) + + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + + short_factor = rope_scaling.get("short_factor") + if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor): + logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}") + if not len(short_factor) == dim // 2: + logger.warning(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}") + + long_factor = rope_scaling.get("long_factor") + if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor): + logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}") + if not len(long_factor) == dim // 2: + logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") + + # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over + # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is + # unique to longrope (= undesirable) + if hasattr(config, "original_max_position_embeddings"): + logger.warning_once( + "This model has set a `original_max_position_embeddings` field, to be used together with " + "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`" + "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, " + "as it is compatible with most model architectures." 
+ ) + else: + factor = rope_scaling.get("factor") + if factor is None: + logger.warning("Missing required keys in `rope_scaling`: 'factor'") + elif not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_scaling.get("attention_factor") + if attention_factor is not None and not isinstance(attention_factor, float) or attention_factor < 0: + logger.warning( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + + +def _validate_llama3_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + low_freq_factor = rope_scaling["low_freq_factor"] + high_freq_factor = rope_scaling["high_freq_factor"] + if low_freq_factor is None or not isinstance(low_freq_factor, float): + logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}") + if high_freq_factor is None or not isinstance(high_freq_factor, float): + logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}") + if high_freq_factor <= low_freq_factor: + logger.warning( + "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" + f"{high_freq_factor} and low_freq_factor={low_freq_factor}" + ) + + original_max_position_embeddings = rope_scaling["original_max_position_embeddings"] + if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int): + logger.warning( + "`rope_scaling`'s original_max_position_embeddings field must be an integer, got " + f"{original_max_position_embeddings}" + ) + if original_max_position_embeddings >= config.max_position_embeddings: + logger.warning( + "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got " + f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}" + ) + + +# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types. 
+ROPE_VALIDATION_FUNCTIONS = { + "default": _validate_default_rope_parameters, + "linear": _validate_linear_scaling_rope_parameters, + "dynamic": _validate_dynamic_scaling_rope_parameters, + "yarn": _validate_yarn_parameters, + "longrope": _validate_longrope_parameters, + "llama3": _validate_llama3_parameters, +} + + +def rope_config_validation(config: PretrainedConfig): + """ + Validate the RoPE config arguments, given a `PretrainedConfig` object + """ + rope_scaling = getattr(config, "rope_scaling", None) # not a default parameter in `PretrainedConfig` + if rope_scaling is None: + return + + # BC: "rope_type" was originally "type" + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default")) + validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) + if validation_fn is not None: + validation_fn(config) + else: + logger.warning( + f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" + ) diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 0ad5dd0396194a..3d7658ba372130 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -1209,7 +1209,7 @@ def build(self, input_shape=None): def __init__(self, config, *inputs, **kwargs): super().__init__(*inputs, **kwargs) if not isinstance(config, PretrainedConfig): - raise ValueError( + raise TypeError( f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class " "`PretrainedConfig`. To create a model from a pretrained model use " f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`" diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index ce0086d1e3bcd8..557624f78b66e9 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -855,6 +855,8 @@ def _load_state_dict_into_meta_model( for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) + is_torch_e4m3fn_available = hasattr(torch, "float8_e4m3fn") + for param_name, param in state_dict.items(): # First part of the test is always true as load_state_dict_keys always contains state_dict keys. if param_name not in loaded_state_dict_keys or param_name not in expected_keys: @@ -866,9 +868,10 @@ def _load_state_dict_into_meta_model( module_name = param_name set_module_kwargs = {} - # We convert floating dtypes to the `dtype` passed. We want to keep the buffers/params + # We convert floating dtypes to the `dtype` passed except for float8_e4m3fn type. We also want to keep the buffers/params # in int/uint/bool and not cast them. 
- if dtype is not None and torch.is_floating_point(param): + is_param_float8_e4m3fn = is_torch_e4m3fn_available and param.dtype == torch.float8_e4m3fn + if dtype is not None and torch.is_floating_point(param) and not is_param_float8_e4m3fn: if ( keep_in_fp32_modules is not None and any( @@ -894,7 +897,6 @@ def _load_state_dict_into_meta_model( old_param = getattr(old_param, split) if old_param is None: break - if old_param is not None: if dtype is None: param = param.to(old_param.dtype) @@ -1978,12 +1980,22 @@ def resize_token_embeddings( if new_num_tokens is None and pad_to_multiple_of is None: return model_embeds + # Since we are reusing the same old embeddings with new weight values, gathering is required + is_quantized = hasattr(self, "hf_quantizer") and self.hf_quantizer is not None + if is_deepspeed_zero3_enabled() and not is_quantized: + import deepspeed + + with deepspeed.zero.GatheredParameters(model_embeds.weight, modifier_rank=None): + vocab_size = model_embeds.weight.shape[0] + else: + vocab_size = model_embeds.weight.shape[0] + # Update base model and current model config if hasattr(self.config, "text_config"): - self.config.text_config.vocab_size = model_embeds.weight.shape[0] + self.config.text_config.vocab_size = vocab_size else: - self.config.vocab_size = model_embeds.weight.shape[0] - self.vocab_size = model_embeds.weight.shape[0] + self.config.vocab_size = vocab_size + self.vocab_size = vocab_size # Tie weights again if needed self.tie_weights() @@ -2129,7 +2141,28 @@ def _get_resized_embeddings( else: new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :] - return new_embeddings + # Replace the weights in old_embeddings and return it to maintain the same embedding type. + # This ensures correct functionality when a custom embedding class is passed as input. + # The input and output embedding types remain consistent. (cf. https://github.com/huggingface/transformers/pull/31979) + if is_deepspeed_zero3_enabled() and not is_quantized: + import deepspeed + + params = [old_embeddings.weight, new_embeddings.weight] + with deepspeed.zero.GatheredParameters(params, modifier_rank=0): + old_embeddings.weight = new_embeddings.weight + old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] + + # If the new number of tokens is smaller than the original `padding_idx`, the `padding_idx` + # will be set to `None` in the resized embeddings. 
+ if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: + old_embeddings.padding_idx = None + else: + old_embeddings.weight.data = new_embeddings.weight.data + old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] + if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: + old_embeddings.padding_idx = None + + return old_embeddings def _get_resized_lm_head( self, old_lm_head: nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False @@ -3395,14 +3428,14 @@ def from_pretrained( pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant) ) is_sharded = True - elif os.path.isfile( + elif not use_safetensors and os.path.isfile( os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant)) ): # Load from a PyTorch checkpoint archive_file = os.path.join( pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant) ) - elif os.path.isfile( + elif not use_safetensors and os.path.isfile( os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)) ): # Load from a sharded PyTorch checkpoint @@ -3411,15 +3444,18 @@ def from_pretrained( ) is_sharded = True # At this stage we don't have a weight file so we will raise an error. - elif os.path.isfile( - os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index") - ) or os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME)): + elif not use_safetensors and ( + os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index")) + or os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME)) + ): raise EnvironmentError( f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory" f" {pretrained_model_name_or_path} but there is a file for TensorFlow weights. Use" " `from_tf=True` to load this model from those weights." ) - elif os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME)): + elif not use_safetensors and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME) + ): raise EnvironmentError( f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory" f" {pretrained_model_name_or_path} but there is a file for Flax weights. 
Use `from_flax=True`" @@ -3952,6 +3988,14 @@ def from_pretrained( and hf_quantizer.quantization_config.quant_method == QuantizationMethod.HQQ ): device_map_kwargs["force_hooks"] = True + if ( + hf_quantizer is not None + and hf_quantizer.quantization_config.quant_method == QuantizationMethod.FBGEMM_FP8 + and isinstance(device_map, dict) + and ("cpu" in device_map.values() or "disk" in device_map.values()) + ): + device_map_kwargs["offload_buffers"] = True + if not is_fsdp_enabled() and not is_deepspeed_zero3_enabled(): dispatch_model(model, **device_map_kwargs) @@ -4102,7 +4146,6 @@ def _fix_key(key): if cls._keys_to_ignore_on_load_unexpected is not None: for pat in cls._keys_to_ignore_on_load_unexpected: unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] - if hf_quantizer is not None: missing_keys = hf_quantizer.update_missing_keys(model, missing_keys, prefix) diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index d6e6023a26f768..1b744d0f208d46 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -1418,13 +1418,13 @@ def __init__(self, config: AlignConfig): super().__init__(config) if not isinstance(config.text_config, AlignTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type AlignTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, AlignVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type AlignVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 10c9e10491bbda..f9856ef701f9e0 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -1466,12 +1466,12 @@ def __init__(self, config: AltCLIPConfig): super().__init__(config) if not isinstance(config.vision_config, AltCLIPVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type AltCLIPVisionConfig but is of type" f" {type(config.vision_config)}." ) if not isinstance(config.text_config, AltCLIPTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type AltCLIPTextConfig but is of type" f" {type(config.text_config)}." 
) diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index a9bf55b51f6015..53715f3260422c 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -211,7 +211,7 @@ def _validate_voice_preset_dict(self, voice_preset: Optional[dict] = None): raise ValueError(f"Voice preset unrecognized, missing {key} as a key.") if not isinstance(voice_preset[key], np.ndarray): - raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.") + raise TypeError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.") if len(voice_preset[key].shape) != self.preset_shape[key]: raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.") diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py index 507a93a8e7984f..634c256fe7d81d 100644 --- a/src/transformers/models/bart/modeling_flax_bart.py +++ b/src/transformers/models/bart/modeling_flax_bart.py @@ -1599,7 +1599,7 @@ def __call__( eos_mask = jnp.where(input_ids == self.config.eos_token_id, 1, 0) # The first condition is necessary to overcome jax._src.errors.ConcretizationTypeError during JIT compilation - if type(eos_mask) != jax.interpreters.partial_eval.DynamicJaxprTracer: + if not isinstance(eos_mask, jax.interpreters.partial_eval.DynamicJaxprTracer): if len(jnp.unique(eos_mask.sum(1))) > 1: raise ValueError("All examples must have the same number of tokens.") diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index a8f12746639ccc..cd70e38d008aa3 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -281,7 +281,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (vocab_file,) -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). @@ -442,7 +442,7 @@ def _clean_text(self, text): return "".join(output) -class WordpieceTokenizer(object): +class WordpieceTokenizer: """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index 58ff3d2b83d607..10d71c417a7aaf 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -691,7 +691,7 @@ def tokenize(self, text): # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). @@ -853,7 +853,7 @@ def _clean_text(self, text): # Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer -class WordpieceTokenizer(object): +class WordpieceTokenizer: """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): @@ -910,7 +910,7 @@ def tokenize(self, text): return output_tokens -class SentencepieceTokenizer(object): +class SentencepieceTokenizer: """ Runs sentencepiece tokenization. Based on transformers.models.albert.tokenization_albert.AlbertTokenizer. 
""" diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index d1ba54213a0346..9f8e3cd19cd835 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -1569,6 +1569,7 @@ class BigBirdPegasusPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["BigBirdPegasusEncoderLayer", "BigBirdPegasusDecoderLayer"] _skip_keys_device_placement = "past_key_values" + _supports_param_buffer_assignment = False def _init_weights(self, module): std = self.config.init_std diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index 67724538233430..1a8807214d52ba 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -405,17 +405,3 @@ def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ return token_ids_0 + [self.eos_token_id] - - @property - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. - """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index 01cbf13809d657..0d24ed62c574a3 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -287,18 +287,3 @@ def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ return token_ids_0 + [self.eos_token_id] - - @property - # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. - """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py index 832b5315edfd7c..08c7be332e31ef 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -217,18 +217,3 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = index += 1 return vocab_file, merge_file - - @property - # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. 
- """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py index a80acdb650e445..21fb76cbfc8691 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -98,18 +98,3 @@ def create_token_type_ids_from_sequences( if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - @property - # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. - """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py index 714aaa1e273d1a..3de18c294ae898 100644 --- a/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py +++ b/src/transformers/models/blip/convert_blip_original_pytorch_to_hf.py @@ -188,4 +188,4 @@ def convert_blip_checkpoint(pytorch_dump_folder_path, config_path=None): parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") args = parser.parse_args() - convert_blip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path) + convert_blip_checkpoint(args.pytorch_dump_folder_path, args.config_path) diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index a8f20ace6bd862..46e3a6005b0af6 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -755,13 +755,13 @@ def __init__(self, config: BlipConfig): super().__init__(config) if not isinstance(config.text_config, BlipTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type BlipTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, BlipVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type BlipVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/blip/modeling_tf_blip.py b/src/transformers/models/blip/modeling_tf_blip.py index 1557677eb3fbf2..6c9942b73acefb 100644 --- a/src/transformers/models/blip/modeling_tf_blip.py +++ b/src/transformers/models/blip/modeling_tf_blip.py @@ -794,13 +794,13 @@ def __init__(self, config: BlipConfig, *args, **kwargs): super().__init__(*args, **kwargs) if not isinstance(config.text_config, BlipTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type BlipTextConfig but is of type" f" {type(config.text_config)}." 
) if not isinstance(config.vision_config, BlipVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type BlipVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index e8ae2e7bdf6837..1d37d5f366d43e 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -24,8 +24,9 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss from torch.nn import functional as F +from ...cache_utils import Cache, DynamicCache, StaticCache from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward -from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions, @@ -170,7 +171,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class BloomAttention(nn.Module): - def __init__(self, config: BloomConfig): + def __init__(self, config: BloomConfig, layer_idx: Optional[int] = None): super().__init__() self.pretraining_tp = config.pretraining_tp @@ -191,26 +192,37 @@ def __init__(self, config: BloomConfig): # Layer-wise attention scaling self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) self.beta = 1.0 + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=True) self.dense = nn.Linear(self.hidden_size, self.hidden_size) self.attention_dropout = nn.Dropout(config.attention_dropout) - def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + def _reshape(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ - Split the last dimension into (num_heads, head_dim) without making any copies, results share same memory - storage as `fused_qkv` + Split the last dimension into (num_heads, head_dim) and reshapes to (bs, heads, len, dim) shape + without making any copies, results share same memory storage as `fused_qkv` Args: fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim] Returns: - query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim] - value: [batch_size, seq_length, num_heads, head_dim] + query: [batch_size, num_heads, seq_length, head_dim] + key: [batch_size, num_heads, seq_length, head_dim] + value: [batch_size, num_heads, seq_length, head_dim] """ batch_size, seq_length, three_times_hidden_size = fused_qkv.shape fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim) - return fused_qkv[..., 0, :], fused_qkv[..., 1, :], fused_qkv[..., 2, :] + query_layer = fused_qkv[..., 0, :].transpose(1, 2) + key_layer = fused_qkv[..., 1, :].transpose(1, 2) + value_layer = fused_qkv[..., 2, :].transpose(1, 2) + return query_layer, key_layer, value_layer def _merge_heads(self, x: torch.Tensor) -> torch.Tensor: """ @@ -243,35 +255,27 @@ def forward( residual: torch.Tensor, alibi: torch.Tensor, attention_mask: torch.Tensor, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + layer_past: Optional[Cache] = None, head_mask: Optional[torch.Tensor] = None, use_cache: bool = False, output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, ): + batch_size, q_length, _ = hidden_states.shape fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + # 3 x [batch_size, num_heads, seq_length, head_dim] + query_layer, key_layer, value_layer = self._reshape(fused_qkv) - # 3 x [batch_size, seq_length, num_heads, head_dim] - (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) - - batch_size, q_length, _, _ = query_layer.shape - - query_layer = query_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) - key_layer = key_layer.permute(0, 2, 3, 1).reshape(batch_size * self.num_heads, self.head_dim, q_length) - value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) if layer_past is not None: - past_key, past_value = layer_past - # concatenate along seq_length dimension: - # - key: [batch_size * self.num_heads, head_dim, kv_length] - # - value: [batch_size * self.num_heads, kv_length, head_dim] - key_layer = torch.cat((past_key, key_layer), dim=2) - value_layer = torch.cat((past_value, value_layer), dim=1) + cache_kwargs = {"cache_position": cache_position} + key_layer, value_layer = layer_past.update(key_layer, value_layer, self.layer_idx, cache_kwargs) - _, _, kv_length = key_layer.shape + # reshape qkv for further computations + query_layer = query_layer.reshape(batch_size * self.num_heads, -1, self.head_dim) + key_layer = key_layer.reshape(batch_size * self.num_heads, -1, self.head_dim).transpose(1, 2) + value_layer = 
value_layer.reshape(batch_size * self.num_heads, -1, self.head_dim) - if use_cache is True: - present = (key_layer, value_layer) - else: - present = None + kv_length = cache_position[-1] + 1 # cache position is 0-indexed while length should start from 1 # [batch_size * num_heads, q_length, kv_length] # we use `torch.Tensor.baddbmm` instead of `torch.baddbmm` as the latter isn't supported by TorchScript v1.11 @@ -283,15 +287,13 @@ def forward( ) # change view to [batch_size, num_heads, q_length, kv_length] - attention_scores = matmul_result.view(batch_size, self.num_heads, q_length, kv_length) + attn_weights = matmul_result.view(batch_size, self.num_heads, q_length, kv_length) + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, :kv_length] + attn_weights = attn_weights + causal_mask - # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length] - input_dtype = attention_scores.dtype - # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38` - if input_dtype == torch.float16: - attention_scores = attention_scores.to(torch.float) - attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min) - attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(input_dtype) + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype + attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_layer.dtype) # [batch_size, num_heads, q_length, kv_length] attention_probs = self.attention_dropout(attention_probs) @@ -322,7 +324,7 @@ def forward( output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training) - outputs = (output_tensor, present) + outputs = (output_tensor, layer_past) if output_attentions: outputs += (attention_probs,) @@ -361,13 +363,13 @@ def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch. 
class BloomBlock(nn.Module): - def __init__(self, config: BloomConfig): + def __init__(self, config: BloomConfig, layer_idx: Optional[int] = None): super().__init__() hidden_size = config.hidden_size self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.num_heads = config.n_head - self.self_attention = BloomAttention(config) + self.self_attention = BloomAttention(config, layer_idx) self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.mlp = BloomMLP(config) @@ -380,10 +382,11 @@ def forward( hidden_states: torch.Tensor, alibi: torch.Tensor, attention_mask: torch.Tensor, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + layer_past: Optional[Cache] = None, head_mask: Optional[torch.Tensor] = None, use_cache: bool = False, output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, ): # hidden_states: [batch_size, seq_length, hidden_size] @@ -406,6 +409,7 @@ def forward( head_mask=head_mask, use_cache=use_cache, output_attentions=output_attentions, + cache_position=cache_position, ) attention_output = attn_outputs[0] @@ -428,7 +432,7 @@ def forward( else: outputs = (output,) + outputs[1:] - return outputs # hidden_states, present, attentions + return outputs # hidden_states, past_kv, attentions class BloomPreTrainedModel(PreTrainedModel): @@ -437,6 +441,7 @@ class BloomPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["BloomBlock"] _skip_keys_device_placement = "past_key_values" + _supports_cache_class = True def __init__(self, *inputs, **kwargs): super().__init__(*inputs, **kwargs) @@ -457,45 +462,6 @@ def _init_weights(self, module: nn.Module): module.bias.data.zero_() module.weight.data.fill_(1.0) - @staticmethod - def _convert_to_standard_cache( - past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], batch_size: int - ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: - """ - Standardizes the format of the cache so as to match most implementations, i.e. to tuple(tuple([batch_size, - num_heads, ...])) - """ - batch_size_times_num_heads, head_dim, seq_length = past_key_value[0][0].shape - num_heads = batch_size_times_num_heads // batch_size - # key: [batch_size * num_heads, head_dim, seq_length] -> [batch_size, num_heads, head_dim, seq_length] - # value: [batch_size * num_heads, seq_length, head_dim] -> [batch_size, num_heads, seq_length, head_dim] - return tuple( - ( - layer_past[0].view(batch_size, num_heads, head_dim, seq_length), - layer_past[1].view(batch_size, num_heads, seq_length, head_dim), - ) - for layer_past in past_key_value - ) - - @staticmethod - def _convert_to_bloom_cache( - past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]], - ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]: - """ - Converts the cache to the format expected by Bloom, i.e. 
to tuple(tuple([batch_size * num_heads, ...])) - """ - batch_size, num_heads, head_dim, seq_length = past_key_value[0][0].shape - batch_size_times_num_heads = batch_size * num_heads - # key: [batch_size, num_heads, head_dim, seq_length] -> [batch_size * num_heads, head_dim, seq_length] - # value: [batch_size, num_heads, seq_length, head_dim] -> [batch_size * num_heads, seq_length, head_dim] - return tuple( - ( - layer_past[0].view(batch_size_times_num_heads, head_dim, seq_length), - layer_past[1].view(batch_size_times_num_heads, seq_length, head_dim), - ) - for layer_past in past_key_value - ) - BLOOM_START_DOCSTRING = r""" @@ -525,14 +491,23 @@ def _convert_to_bloom_cache( [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) - past_key_values (`Tuple[Tuple[torch.Tensor]]` of length `config.n_layers`): - Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see - `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have - their past given to this model should not be passed as `input_ids` as they have already been computed. - - Each element of `past_key_values` is a tuple (past_key, past_value): - - past_key: [batch_size * num_heads, head_dim, kv_length] - - past_value: [batch_size * num_heads, kv_length, head_dim] + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: @@ -564,6 +539,10 @@ def _convert_to_bloom_cache( more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. 
""" @@ -583,7 +562,7 @@ def __init__(self, config: BloomConfig): self.word_embeddings_layernorm = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) # Transformer blocks - self.h = nn.ModuleList([BloomBlock(config) for _ in range(config.num_hidden_layers)]) + self.h = nn.ModuleList([BloomBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)]) # Final Layer Norm self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) @@ -611,7 +590,7 @@ def set_input_embeddings(self, new_embeddings: torch.Tensor): def forward( self, input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.LongTensor] = None, @@ -619,6 +598,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, **deprecated_arguments, ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: if deprecated_arguments.pop("position_ids", False) is not False: @@ -638,62 +618,59 @@ def forward( use_cache = use_cache if use_cache is not None else self.config.use_cache return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - batch_size, seq_length = input_ids.shape - elif inputs_embeds is not None: - batch_size, seq_length, _ = inputs_embeds.shape - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) - if past_key_values is None: - past_key_values = tuple([None] * len(self.h)) + # kept for BC (non `Cache` `past_key_values` inputs) + use_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache) and not self.training: + use_legacy_cache = True + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "Using `past_key_values` as a tuple is deprecated and will be removed in v4.45. 
" + "Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)" + ) + + batch_size, seq_length, _ = inputs_embeds.shape + past_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + seq_length_with_past = seq_length + past_length + if cache_position is None: + cache_position = torch.arange(past_length, past_length + seq_length, device=inputs_embeds.device) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape batch_size x num_heads x N x N # head_mask has shape n_layer x batch x num_heads x N x N head_mask = self.get_head_mask(head_mask, self.config.n_layer) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - hidden_states = self.word_embeddings_layernorm(inputs_embeds) - presents = () if use_cache else None + next_decoder_cache = None all_self_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - # Compute alibi tensor: check build_alibi_tensor documentation - seq_length_with_past = seq_length - past_key_values_length = 0 - if past_key_values[0] is not None: - past_key_values_length = past_key_values[0][0].shape[2] - seq_length_with_past = seq_length_with_past + past_key_values_length if attention_mask is None: attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device) else: attention_mask = attention_mask.to(hidden_states.device) alibi = self.build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype) - - causal_mask = _prepare_4d_causal_attention_mask( - attention_mask, - input_shape=(batch_size, seq_length), - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions ) - causal_mask = causal_mask.bool() - for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + for i, block in enumerate(self.h): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -703,25 +680,27 @@ def forward( hidden_states, alibi, causal_mask, - layer_past, + past_key_values, head_mask[i], use_cache, output_attentions, + cache_position, ) else: outputs = block( hidden_states, - layer_past=layer_past, + layer_past=past_key_values, attention_mask=causal_mask, head_mask=head_mask[i], use_cache=use_cache, output_attentions=output_attentions, alibi=alibi, + cache_position=cache_position, ) hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) + if use_cache: + next_decoder_cache = outputs[1] if output_attentions: all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) @@ -732,16 +711,103 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) + next_cache = None + if use_cache: + next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache + if not return_dict: - return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None) + return tuple( + v for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions] if v is not None + ) return 
BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, - past_key_values=presents, + past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attentions, ) + # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_length() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + if attention_mask is not None and attention_mask.dim() == 4: + # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing + if attention_mask.max() != 0: + raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`") + causal_mask = attention_mask + else: + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and 
attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + @add_start_docstrings( """ @@ -769,39 +835,34 @@ def set_output_embeddings(self, new_embeddings: torch.Tensor): def prepare_inputs_for_generation( self, - input_ids: torch.LongTensor, - past_key_values: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - inputs_embeds: Optional[torch.Tensor] = None, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + use_cache=True, **kwargs, - ) -> dict: - # only last tokens for input_ids if past is not None + ): + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here if past_key_values is not None: - past_length = past_key_values[0][0].shape[2] - - # Some generation methods already pass only the last input ID - if input_ids.shape[1] > past_length: - remove_prefix_length = past_length - else: - # Default to old behavior: keep only final ID - remove_prefix_length = input_ids.shape[1] - 1 - - input_ids = input_ids[:, remove_prefix_length:] - - # the cache may be in the stardard format (e.g. in contrastive search), convert to bloom's format if needed - if past_key_values[0][0].shape[0] == input_ids.shape[0]: - past_key_values = self._convert_to_bloom_cache(past_key_values) + if inputs_embeds is not None: # Exception 1 + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: + if inputs_embeds is not None and cache_position[0] == 0: model_inputs = {"inputs_embeds": inputs_embeds} else: - model_inputs = {"input_ids": input_ids} + model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases model_inputs.update( { + "cache_position": cache_position, "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), + "use_cache": use_cache, "attention_mask": attention_mask, } ) @@ -816,7 +877,7 @@ def prepare_inputs_for_generation( def forward( self, input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -825,6 +886,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, **deprecated_arguments, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" @@ -855,6 +917,7 @@ def forward( 
output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, + cache_position=cache_position, ) hidden_states = transformer_outputs[0] @@ -896,8 +959,6 @@ def _reorder_cache( Output shares the same memory storage as `past`. """ - standardized_past = self._convert_to_standard_cache(past, batch_size=len(beam_idx)) - # Get a copy of `beam_idx` on all the devices where we need those indices. device_to_beam_idx = { past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past @@ -907,9 +968,9 @@ def _reorder_cache( layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]), layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]), ) - for layer_past in standardized_past + for layer_past in past ) - return self._convert_to_bloom_cache(reordered_past) + return reordered_past @add_start_docstrings( @@ -946,7 +1007,7 @@ def __init__(self, config: BloomConfig): def forward( self, input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -1083,7 +1144,7 @@ def __init__(self, config: BloomConfig): def forward( self, input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py index d0da1621d4c968..54e6377353084d 100644 --- a/src/transformers/models/bloom/tokenization_bloom_fast.py +++ b/src/transformers/models/bloom/tokenization_bloom_fast.py @@ -147,11 +147,3 @@ def _encode_plus(self, *args, **kwargs) -> BatchEncoding: def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) - - @property - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. 
- """ - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 1eea9b224958b1..f7237ebb0c85ef 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -76,11 +76,15 @@ def forward(self, hidden_states): hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + ALL_LAYERNORM_LAYERS.append(ChameleonRMSNorm) -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): super().__init__() @@ -110,7 +114,8 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonLinearScalingRotaryEmbedding(ChameleonRotaryEmbedding): """ChameleonRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" @@ -121,7 +126,8 @@ def forward(self, x, position_ids): return cos, sin -# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonDynamicNTKScalingRotaryEmbedding(ChameleonRotaryEmbedding): """ChameleonRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" @@ -265,7 +271,8 @@ def __init__(self, config: ChameleonConfig, layer_idx: Optional[int] = None): self.k_norm = ChameleonLayerNorm((self.num_key_value_heads, self.head_dim)) self._init_rope() - # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon + # copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon + # TODO(joao): add me back asap :) def _init_rope(self): if self.config.rope_scaling is None: self.rotary_emb = ChameleonRotaryEmbedding( @@ -358,7 +365,8 @@ def forward( return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonFlashAttention2(ChameleonAttention): """ Chameleon flash attention module. 
This module inherits from `ChameleonAttention` as the weights of the module stays @@ -576,7 +584,8 @@ def forward( } -# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON +# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON +# TODO(joao): add me back asap :) class ChameleonDecoderLayer(nn.Module): def __init__(self, config: ChameleonConfig, layer_idx: int): super().__init__() diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 559cac62e3d5a7..1480808336d14e 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -113,7 +113,7 @@ def __call__( if isinstance(text, str): text = [text] elif not isinstance(text, list) and not isinstance(text[0], str): - raise ValueError("Invalid input text. Please provide a string, or a list of strings") + raise TypeError("Invalid input text. Please provide a string, or a list of strings") # Replace the image token with the expanded image token sequence prompt_strings = [] diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 801969c465bfb0..6fbd9459f5ad71 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -1341,13 +1341,13 @@ def __init__(self, config: ChineseCLIPConfig): super().__init__(config) if not isinstance(config.text_config, ChineseCLIPTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type ChineseCLIPTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, ChineseCLIPVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type ChineseCLIPVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 3e83daa942c022..939032f2c894cc 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1928,13 +1928,13 @@ def __init__(self, config: ClapConfig): super().__init__(config) if not isinstance(config.text_config, ClapTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type ClapTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.audio_config, ClapAudioConfig): - raise ValueError( + raise TypeError( "config.audio_config is expected to be of type ClapAudioConfig but is of type" f" {type(config.audio_config)}." ) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index b96acfc0936c1d..ee85fe3125873b 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -1119,13 +1119,13 @@ def __init__(self, config: CLIPConfig): super().__init__(config) if not isinstance(config.text_config, CLIPTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type CLIPTextConfig but is of type" f" {type(config.text_config)}." 
) if not isinstance(config.vision_config, CLIPVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type CLIPVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index b728da52c222b4..ca5f4aede21854 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -825,13 +825,13 @@ def __init__(self, config: CLIPConfig, **kwargs): super().__init__(**kwargs) if not isinstance(config.text_config, CLIPTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type CLIPTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, CLIPVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type CLIPVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py index 7b4ad88b80a9e0..83e79890d084b3 100644 --- a/src/transformers/models/clip/tokenization_clip.py +++ b/src/transformers/models/clip/tokenization_clip.py @@ -90,7 +90,7 @@ def whitespace_tokenize(text): # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index af7b94a10f4d7c..97fcf3d1f2b3e6 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -924,13 +924,13 @@ def __init__(self, config: CLIPSegConfig): super().__init__(config) if not isinstance(config.text_config, CLIPSegTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type CLIPSegTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, CLIPSegVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type CLIPSegVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index a673d64614d786..4db7f42517fd33 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -239,6 +239,9 @@ def forward(self, hidden_states): hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + class ClvpRotaryPositionalEmbedding(nn.Module): """ @@ -1513,19 +1516,19 @@ def __init__(self, config: ClvpConfig): super().__init__(config) if not isinstance(config.text_config, ClvpEncoderConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type `ClvpEncoderConfig` but is of type" f" {type(config.text_config)}." ) if not isinstance(config.speech_config, ClvpEncoderConfig): - raise ValueError( + raise TypeError( "config.speech_config is expected to be of type `ClvpEncoderConfig` but is of type" f" {type(config.speech_config)}." 
) if not isinstance(config.decoder_config, ClvpDecoderConfig): - raise ValueError( + raise TypeError( "config.decoder_config is expected to be of type `ClvpDecoderConfig` but is of type" f" {type(config.decoder_config)}." ) @@ -1681,7 +1684,7 @@ def get_speech_features( >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library) >>> text = "This is an example text." - >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() @@ -1754,7 +1757,7 @@ def forward( >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library) >>> text = "This is an example text." - >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() diff --git a/src/transformers/models/clvp/processing_clvp.py b/src/transformers/models/clvp/processing_clvp.py index ebccab89d0fca3..4e015cea1f8475 100644 --- a/src/transformers/models/clvp/processing_clvp.py +++ b/src/transformers/models/clvp/processing_clvp.py @@ -73,6 +73,7 @@ def __call__(self, *args, **kwargs): inputs["attention_mask"] = encodings["attention_mask"] return inputs + # Copied from transformers.models.whisper.processing_whisper.WhisperProcessor.batch_decode with Whisper->Clvp def batch_decode(self, *args, **kwargs): """ This method forwards all its arguments to ClvpTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please diff --git a/src/transformers/models/code_llama/tokenization_code_llama.py b/src/transformers/models/code_llama/tokenization_code_llama.py index 5bbf2d0452f4ff..cc906687874ce0 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama.py +++ b/src/transformers/models/code_llama/tokenization_code_llama.py @@ -437,61 +437,6 @@ def create_token_type_ids_from_sequences( return output - @property - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. - Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! 
- - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. - """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template - def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py b/src/transformers/models/code_llama/tokenization_code_llama_fast.py index 9bdb7a65b58499..b832348d07af4d 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py +++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py @@ -349,61 +349,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (out_vocab_file,) - @property - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. - Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! 
- - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. - """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template - def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 5322c2334d37b5..6257eeb9958cc8 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -295,7 +295,8 @@ def forward( return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere +# TODO(joao): add me back asap :) class CohereFlashAttention2(CohereAttention): """ Cohere flash attention module. This module inherits from `CohereAttention` as the weights of the module stays @@ -409,7 +410,8 @@ def forward( return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere +# TODO(joao): add me back asap :) class CohereSdpaAttention(CohereAttention): """ Cohere attention module using torch.nn.functional.scaled_dot_product_attention. 
This module inherits from @@ -697,7 +699,8 @@ def _init_weights(self, module): "The bare Cohere Model outputting raw hidden-states without any specific head on top.", COHERE_START_DOCSTRING, ) -# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere +# TODO(joao): add me back asap :) class CohereModel(CoherePreTrainedModel): """ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CohereDecoderLayer`] @@ -766,7 +769,9 @@ def forward( past_seen_tokens = 0 return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) diff --git a/src/transformers/models/cohere/tokenization_cohere_fast.py b/src/transformers/models/cohere/tokenization_cohere_fast.py index b0a62e279ca8e9..bac665b473c57b 100644 --- a/src/transformers/models/cohere/tokenization_cohere_fast.py +++ b/src/transformers/models/cohere/tokenization_cohere_fast.py @@ -228,188 +228,6 @@ def add_bos_token(self, value): self._add_bos_token = value self.update_post_processor() - @property - def default_chat_template(self): - """ - Cohere Tokenizer uses <|START_OF_TURN_TOKEN|> and <|END_OF_TURN_TOKEN|> to indicate each turn in a chat. - Additioanlly, to indicate the source of the message, <|USER_TOKEN|>, <|CHATBOT_TOKEN|> and <|SYSTEM_TOKEN|> - for user, assitant and system messages respectively. - - The output should look something like: - <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ preamble }}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ How are you? }}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{ I am doing well! 
}}<|END_OF_TURN_TOKEN|> - - Use add_generation_prompt to add a prompt for the model to generate a response: - >>> from transformers import AutoTokenizer - >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01") - >>> messages = [{"role": "user", "content": "Hello, how are you?"}] - >>> tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' - - """ - default_template = ( - "{{ bos_token }}" - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% if system_message != false %}" # Start with system message - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% set content = message['content'] %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% endfor %}" - "{% if add_generation_prompt %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}" - "{% endif %}" - ) - default_template = default_template.replace( - "USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false" - ) - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - default_template = default_template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - tool_use_template = ( - "{{ bos_token }}" - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% endif %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}" - "{{ '# Safety Preamble' }}" - "{{ '\nThe instructions in this section override those in the task description and style guide sections. Don\\'t answer questions that are harmful or immoral.' }}" - "{{ '\n\n# System Preamble' }}" - "{{ '\n## Basic Rules' }}" - "{{ '\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user\\'s requests, you cite your sources in your answers, according to those instructions.' 
}}" - "{{ '\n\n# User Preamble' }}" - "{{ '\n' + system_message }}" - "{{'\n\n## Available Tools\nHere is a list of tools that you have available to you:\n\n'}}" - "{% for tool in tools %}" - "{% if loop.index0 != 0 %}" - "{{ '\n\n'}}" - "{% endif %}" - "{{'```python\ndef ' + tool.name + '('}}" - "{% for param_name, param_fields in tool.parameter_definitions.items() %}" - "{% if loop.index0 != 0 %}" - "{{ ', '}}" - "{% endif %}" - "{{param_name}}: " - "{% if not param_fields.required %}" - "{{'Optional[' + param_fields.type + '] = None'}}" - "{% else %}" - "{{ param_fields.type }}" - "{% endif %}" - "{% endfor %}" - '{{ \') -> List[Dict]:\n """\'}}' - "{{ tool.description }}" - "{% if tool.parameter_definitions|length != 0 %}" - "{{ '\n\n Args:\n '}}" - "{% for param_name, param_fields in tool.parameter_definitions.items() %}" - "{% if loop.index0 != 0 %}" - "{{ '\n ' }}" - "{% endif %}" - "{{ param_name + ' ('}}" - "{% if not param_fields.required %}" - "{{'Optional[' + param_fields.type + ']'}}" - "{% else %}" - "{{ param_fields.type }}" - "{% endif %}" - "{{ '): ' + param_fields.description }}" - "{% endfor %}" - "{% endif %}" - '{{ \'\n """\n pass\n```\' }}' - "{% endfor %}" - "{{ '<|END_OF_TURN_TOKEN|>'}}" - "{% for message in loop_messages %}" - "{% set content = message['content'] %}" - "{% if message['role'] == 'user' %}" - "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% endfor %}" - "{{'<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write \\'Action:\\' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user\\'s last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:\n```json\n[\n {\n \"tool_name\": title of the tool in the specification,\n \"parameters\": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters\n }\n]```<|END_OF_TURN_TOKEN|>'}}" - "{% if add_generation_prompt %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}" - "{% endif %}" - ) - default_tool_message = DEFAULT_RAG_PREAMBLE.replace("\n", "\\n").replace("'", "\\'") - tool_use_template = tool_use_template.replace("DEFAULT_SYSTEM_MESSAGE", default_tool_message) - - rag_template = ( - "{{ bos_token }}" - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% endif %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}" - "{{ '# Safety Preamble' }}" - "{{ '\nThe instructions in this section override those in the task description and style guide sections. Don\\'t answer questions that are harmful or immoral.' }}" - "{{ '\n\n# System Preamble' }}" - "{{ '\n## Basic Rules' }}" - "{{ '\nYou are a powerful conversational AI trained by Cohere to help people. 
You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user\\'s requests, you cite your sources in your answers, according to those instructions.' }}" - "{{ '\n\n# User Preamble' }}" - "{{ '\n' + system_message }}" - "{{ '<|END_OF_TURN_TOKEN|>'}}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% set content = message['content'] %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% endfor %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>'}}" - "{{ '' }}" - "{% for document in documents %}" # Loop over all non-system messages - "{{ '\nDocument: ' }}" - "{{ loop.index0 }}\n" - "{% for key, value in document.items() %}" - "{{ key }}: {{value}}\n" - "{% endfor %}" - "{% endfor %}" - "{{ ''}}" - "{{ '<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}" - "{{ 'Carefully perform the following instructions, in order, starting each with a new line.\n' }}" - "{{ 'Firstly, Decide which of the retrieved documents are relevant to the user\\'s last input by writing \\'Relevant Documents:\\' followed by comma-separated list of document numbers. If none are relevant, you should instead write \\'None\\'.\n' }}" - "{{ 'Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user\\'s last input by writing \\'Cited Documents:\\' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write \\'None\\'.\n' }}" - "{% if citation_mode=='accurate' %}" - "{{ 'Thirdly, Write \\'Answer:\\' followed by a response to the user\\'s last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup.\n' }}" - "{% endif %}" - "{{ 'Finally, Write \\'Grounded answer:\\' followed by a response to the user\\'s last input in high quality natural english. Use the symbols and to indicate when a fact comes from a document in the search result, e.g my fact for a fact from document 0.' 
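The Cohere default, tool-use and RAG templates are removed here as a Python property, but checkpoints that ship the same templates as named entries in `tokenizer.chat_template` can still be used by name. A hedged sketch, assuming the checkpoint stores a `"rag"` entry and that extra keyword arguments such as `documents` and `citation_mode` are forwarded to the template renderer:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")  # illustrative checkpoint

messages = [{"role": "user", "content": "Which document mentions penguins?"}]
documents = [{"title": "Birds", "text": "Penguins are flightless birds."}]

# Select the named template; extra kwargs become variables inside the Jinja template.
prompt = tokenizer.apply_chat_template(
    messages,
    chat_template="rag",
    documents=documents,
    citation_mode="accurate",
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt[:200])
```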
}}" - "{{ '<|END_OF_TURN_TOKEN|>' }}" - "{% if add_generation_prompt %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}" - "{% endif %}" - ) - default_rag_message = DEFAULT_RAG_PREAMBLE.replace("\n", "\\n").replace("'", "\\'") - rag_template = rag_template.replace("DEFAULT_SYSTEM_MESSAGE", default_rag_message) - - return {"default": default_template, "tool_use": tool_use_template, "rag": rag_template} - def apply_tool_use_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py index e72daa64713e8c..e0dcca67aefb5a 100644 --- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py +++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py @@ -2596,7 +2596,7 @@ def _max_by_axis(the_list): # Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor(object): +class NestedTensor: def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors self.mask = mask diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py index f1bc98bf41eedc..cc8cb1b9a738ab 100644 --- a/src/transformers/models/convbert/tokenization_convbert.py +++ b/src/transformers/models/convbert/tokenization_convbert.py @@ -285,7 +285,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). @@ -447,7 +447,7 @@ def _clean_text(self, text): # Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer -class WordpieceTokenizer(object): +class WordpieceTokenizer: """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): diff --git a/src/transformers/models/cpmant/tokenization_cpmant.py b/src/transformers/models/cpmant/tokenization_cpmant.py index 2ccb296c70d98e..094a14ffce069f 100644 --- a/src/transformers/models/cpmant/tokenization_cpmant.py +++ b/src/transformers/models/cpmant/tokenization_cpmant.py @@ -44,7 +44,7 @@ def load_vocab(vocab_file): return vocab -class WordpieceTokenizer(object): +class WordpieceTokenizer: def __init__(self, vocab, unk_token="", max_input_chars_per_word=200): self.vocab = vocab self.unk_token = unk_token diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 91f4fc3a4b1c9f..a040a649d9d8e3 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -249,6 +249,7 @@ def __init__( self.use_cache = use_cache self.initializer_range = initializer_range self.output_router_logits = output_router_logits + self.num_key_value_heads = self.attn_config.kv_n_heads tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) if tie_word_embeddings: diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 31810028ef4448..b1f3ce1b8ba963 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -415,6 +415,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, is_causal=self.is_causal, 
use_top_left_mask=self._flash_attn_uses_top_left_mask, @@ -1004,7 +1005,9 @@ def forward( inputs_embeds = nn.functional.dropout(inputs_embeds, p=self.emb_pdrop, training=self.training) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 964e3add914afd..af6d630a4dbc97 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -138,7 +138,7 @@ def symbolic(g, self, mask, dim): return masked_fill(g, output, r_mask, g.op("Constant", value_t=torch.tensor(0, dtype=torch.bool))) -class DropoutContext(object): +class DropoutContext: def __init__(self): self.dropout = 0 self.mask = None diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index fd910e9daf7427..72653582d2b819 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -133,7 +133,7 @@ def symbolic(g, self, mask, dim): # Copied from transformers.models.deberta.modeling_deberta.DropoutContext -class DropoutContext(object): +class DropoutContext: def __init__(self): self.dropout = 0 self.mask = None diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index 2876ac7660493c..6ff689f80a5c1b 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -518,4 +518,4 @@ def convert_to_unicode(text): elif isinstance(text, bytes): return text.decode("utf-8", "ignore") else: - raise ValueError(f"Unsupported string type: {type(text)}") + raise TypeError(f"Unsupported string type: {type(text)}") diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 03648d33b9ade7..46e00787baf618 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -2491,7 +2491,7 @@ def _max_by_axis(the_list): # Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor(object): +class NestedTensor: def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors self.mask = mask diff --git a/src/transformers/models/deprecated/deta/modeling_deta.py b/src/transformers/models/deprecated/deta/modeling_deta.py index bc195749399e2d..075b490cfa7b6a 100644 --- a/src/transformers/models/deprecated/deta/modeling_deta.py +++ b/src/transformers/models/deprecated/deta/modeling_deta.py @@ -2516,7 +2516,7 @@ def nonzero_tuple(x): # from https://github.com/facebookresearch/detectron2/blob/9921a2caa585d4fa66c4b534b6fab6e74d89b582/detectron2/modeling/matcher.py#L9 -class DetaMatcher(object): +class DetaMatcher: """ This class assigns to each predicted "element" (e.g., a box) a ground-truth element. 
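The DBRX hunk above (and the matching Cohere and Gemma hunks elsewhere in this diff) now wraps legacy tuple-style `past_key_values` in a `DynamicCache` only outside of training. A minimal sketch of that backward-compatibility conversion, with made-up shapes:

```python
import torch
from transformers import Cache, DynamicCache

# Two decoder layers of legacy (key, value) tuples; shapes are made up for illustration.
batch, heads, seq, head_dim = 1, 4, 3, 8
past_key_values = tuple(
    (torch.zeros(batch, heads, seq, head_dim), torch.zeros(batch, heads, seq, head_dim))
    for _ in range(2)
)

use_cache, training = True, False
if use_cache and not isinstance(past_key_values, Cache) and not training:
    # Same guard as above: convert for inference only, leave training inputs untouched.
    past_key_values = DynamicCache.from_legacy_cache(past_key_values)

print(type(past_key_values).__name__, past_key_values.get_seq_length())  # DynamicCache 3
```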
Each predicted element will have exactly zero or one matches; each ground-truth element may be matched to zero or more predicted elements. diff --git a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py index 51789e49b2d263..f1331da83eec5d 100644 --- a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py +++ b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py @@ -236,19 +236,6 @@ def convert_tokens_to_string(self, tokens): text = "".join(words) return text - @property - def default_chat_template(self): - """ - A simple chat template that adds standard BOS, SEP and EOS tokens between messages while discarding role - information. - """ - return ( - "{% for message in messages %}" - "{% if not loop.first %}{{ bos_token}}{% endif %}" - "{{ sep_token }}{{ message.content }} {{ eos_token }}" - "{% endfor %}" - ) - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: index = 0 if os.path.isdir(save_directory): @@ -378,7 +365,7 @@ def _batch_encode_plus( ) -class SubWordJapaneseTokenizer(object): +class SubWordJapaneseTokenizer: """ This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications - Decoding byte0~byte255 tokens correctly diff --git a/src/transformers/models/deprecated/mega/modeling_mega.py b/src/transformers/models/deprecated/mega/modeling_mega.py index 92d91bdb28bb2d..32f37dde5349a1 100644 --- a/src/transformers/models/deprecated/mega/modeling_mega.py +++ b/src/transformers/models/deprecated/mega/modeling_mega.py @@ -250,6 +250,9 @@ def forward(self, input): input * torch.rsqrt(mean_square + self.eps) return input + def extra_repr(self): + return f"{self.num_features}, eps={self.eps}, affine={self.affine}" + class MegaScaleNorm(nn.Module): """ diff --git a/src/transformers/models/deprecated/mmbt/configuration_mmbt.py b/src/transformers/models/deprecated/mmbt/configuration_mmbt.py index 8fcc0f1d63d290..73696087faf3bf 100644 --- a/src/transformers/models/deprecated/mmbt/configuration_mmbt.py +++ b/src/transformers/models/deprecated/mmbt/configuration_mmbt.py @@ -21,7 +21,7 @@ logger = logging.get_logger(__name__) -class MMBTConfig(object): +class MMBTConfig: """ This is the configuration class to store the configuration of a [`MMBTModel`]. It is used to instantiate a MMBT model according to the specified arguments, defining the model architecture. 
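Several norm layers in this diff (MegaRMSNorm above, plus the OpenLlama, Gemma and Idefics RMSNorms later on) gain an `extra_repr`. A small stand-in module, not the actual transformers layer, showing what that buys when a model is printed:

```python
import torch
from torch import nn

class TinyRMSNorm(nn.Module):
    """Stand-in for the RMSNorm layers touched in this diff, not the real implementation."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        return self.weight * hidden_states * torch.rsqrt(variance + self.variance_epsilon)

    def extra_repr(self):
        # Same pattern as the hunks in this diff: surface shape and eps in repr().
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

print(TinyRMSNorm(16))  # TinyRMSNorm((16,), eps=1e-06)
```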
diff --git a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py index 7d2098f2f63fff..b6043fde047e5a 100644 --- a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py @@ -62,6 +62,9 @@ def forward(self, hidden_states): hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + class OpenLlamaRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): diff --git a/src/transformers/models/deprecated/realm/tokenization_realm.py b/src/transformers/models/deprecated/realm/tokenization_realm.py index 671405301dff18..8211c1aee8707d 100644 --- a/src/transformers/models/deprecated/realm/tokenization_realm.py +++ b/src/transformers/models/deprecated/realm/tokenization_realm.py @@ -354,7 +354,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (vocab_file,) -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). @@ -503,7 +503,7 @@ def _clean_text(self, text): return "".join(output) -class WordpieceTokenizer(object): +class WordpieceTokenizer: """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): diff --git a/src/transformers/models/deprecated/retribert/tokenization_retribert.py b/src/transformers/models/deprecated/retribert/tokenization_retribert.py index 2f66fcc1edd129..8b3570f1622d57 100644 --- a/src/transformers/models/deprecated/retribert/tokenization_retribert.py +++ b/src/transformers/models/deprecated/retribert/tokenization_retribert.py @@ -283,7 +283,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (vocab_file,) -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). @@ -444,7 +444,7 @@ def _clean_text(self, text): return "".join(output) -class WordpieceTokenizer(object): +class WordpieceTokenizer: """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): diff --git a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py index 4db60e0faeb4c1..8f1a8370933c91 100755 --- a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py @@ -831,7 +831,7 @@ def forward( >>> model.config.decoder_start_token_id = tokenizer.bos_token_id >>> # pre-process inputs and labels - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = feature_extractor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" ... 
) diff --git a/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py index 4229e8e5b3ad65..ca80636b23565d 100644 --- a/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py +++ b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py @@ -511,7 +511,7 @@ def _tokenize(self, line, add_eos=False, add_double_eos=False): return symbols -class LMOrderedIterator(object): +class LMOrderedIterator: def __init__(self, data, bsz, bptt, device="cpu", ext_len=None): """ data -- LongTensor -- the LongTensor is strictly ordered @@ -570,7 +570,7 @@ def __iter__(self): return self.get_fixlen_iter() -class LMShuffledIterator(object): +class LMShuffledIterator: def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False): """ data -- list[LongTensor] -- there is no order among the LongTensors @@ -679,7 +679,7 @@ def __iter__(self): yield batch -class TransfoXLCorpus(object): +class TransfoXLCorpus: @classmethod @torch_only_method def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index 0b1ef77c6a732a..e37f0a3eaf7cbf 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -298,7 +298,7 @@ def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_wi List of hidden states from the backbone. """ if not isinstance(hidden_states, (tuple, list)): - raise ValueError("hidden_states should be a tuple or list of tensors") + raise TypeError("hidden_states should be a tuple or list of tensors") if len(hidden_states) != len(self.config.neck_hidden_sizes): raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 447f8a807fcb66..c3c1c033e556bf 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -2292,7 +2292,7 @@ def _max_by_axis(the_list): return maxes -class NestedTensor(object): +class NestedTensor: def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors self.mask = mask diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index 4d9173fd08bfd5..e80e3c41d22cb6 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -398,7 +398,7 @@ def forward( if output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples - if type(sa_output) != tuple: + if type(sa_output) is not tuple: raise TypeError(f"sa_output must be a tuple but it is {type(sa_output)} type") sa_output = sa_output[0] diff --git a/src/transformers/models/distilbert/modeling_flax_distilbert.py b/src/transformers/models/distilbert/modeling_flax_distilbert.py index d3c48c077adc52..0cb7cdb033c148 100644 --- a/src/transformers/models/distilbert/modeling_flax_distilbert.py +++ b/src/transformers/models/distilbert/modeling_flax_distilbert.py @@ -304,7 +304,7 @@ def __call__( if 
output_attentions: sa_output, sa_weights = sa_output else: - assert type(sa_output) == tuple + assert type(sa_output) is tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + hidden_states) diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py index ff8854ba3dcf89..87b1eb192e4ad7 100644 --- a/src/transformers/models/distilbert/tokenization_distilbert.py +++ b/src/transformers/models/distilbert/tokenization_distilbert.py @@ -295,7 +295,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). @@ -457,7 +457,7 @@ def _clean_text(self, text): # Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer -class WordpieceTokenizer(object): +class WordpieceTokenizer: """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): diff --git a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py index 7b3715bddf311c..367aff7f90e18b 100644 --- a/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py +++ b/src/transformers/models/dpt/convert_dinov2_depth_to_hf.py @@ -200,7 +200,7 @@ def prepare_img(): def get_original_pixel_values(image): - class CenterPadding(object): + class CenterPadding: def __init__(self, multiple): super().__init__() self.multiple = multiple diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py index a407a67f3813ed..16e4d71212b53a 100644 --- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py +++ b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py @@ -43,7 +43,7 @@ def get_dpt_config(checkpoint_url): config.neck_hidden_sizes = [256, 512, 1024, 1024] expected_shape = (1, 384, 384) - if "nyu" or "midas" in checkpoint_url: + if "nyu" in checkpoint_url or "midas" in checkpoint_url: config.hidden_size = 768 config.reassemble_factors = [1, 1, 1, 0.5] config.neck_hidden_sizes = [256, 512, 768, 768] diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index b2b88855669a76..b3e4b86a2a49dc 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1002,7 +1002,7 @@ def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_wi List of hidden states from the backbone. 
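The `convert_dpt_hybrid_to_pytorch.py` change above fixes a classic truthiness bug: `"nyu" or "midas" in checkpoint_url` short-circuits on the non-empty string literal and never inspects the URL. A two-line illustration with a hypothetical URL:

```python
checkpoint_url = "https://example.com/dpt_large-ade.pt"  # hypothetical URL, for illustration

print(bool("nyu" or "midas" in checkpoint_url))              # True, regardless of the URL
print("nyu" in checkpoint_url or "midas" in checkpoint_url)  # False, the intended check
```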
""" if not isinstance(hidden_states, (tuple, list)): - raise ValueError("hidden_states should be a tuple or list of tensors") + raise TypeError("hidden_states should be a tuple or list of tensors") if len(hidden_states) != len(self.config.neck_hidden_sizes): raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/electra/tokenization_electra.py index ceb3e7560215c2..9ecbce63f50b62 100644 --- a/src/transformers/models/electra/tokenization_electra.py +++ b/src/transformers/models/electra/tokenization_electra.py @@ -284,7 +284,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). @@ -446,7 +446,7 @@ def _clean_text(self, text): # Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer -class WordpieceTokenizer(object): +class WordpieceTokenizer: """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): diff --git a/src/transformers/models/esm/openfold_utils/chunk_utils.py b/src/transformers/models/esm/openfold_utils/chunk_utils.py index 301721d135ee4d..51ff6b74d6c3f5 100644 --- a/src/transformers/models/esm/openfold_utils/chunk_utils.py +++ b/src/transformers/models/esm/openfold_utils/chunk_utils.py @@ -32,7 +32,7 @@ def _fetch_dims(tree: Union[dict, list, tuple, torch.Tensor]) -> List[Tuple[int, elif isinstance(tree, torch.Tensor): shapes.append(tree.shape) else: - raise ValueError("Not supported") + raise TypeError("Not supported") return shapes @@ -302,7 +302,7 @@ def assign(d1: dict, d2: dict) -> None: else: out[i : i + chunk_size] = output_chunk else: - raise ValueError("Not supported") + raise TypeError("Not supported") i += chunk_size @@ -356,7 +356,7 @@ def test_chunk_size(chunk_size: int) -> bool: def _compare_arg_caches(self, ac1: Iterable, ac2: Iterable) -> bool: consistent = True for a1, a2 in zip(ac1, ac2): - assert type(ac1) == type(ac2) + assert type(ac1) is type(ac2) if isinstance(ac1, (list, tuple)): consistent &= self._compare_arg_caches(a1, a2) elif isinstance(ac1, dict): diff --git a/src/transformers/models/esm/openfold_utils/residue_constants.py b/src/transformers/models/esm/openfold_utils/residue_constants.py index 8f0ad3b50c6505..200e0d421b8386 100644 --- a/src/transformers/models/esm/openfold_utils/residue_constants.py +++ b/src/transformers/models/esm/openfold_utils/residue_constants.py @@ -394,7 +394,7 @@ def map_structure_with_atom_order(in_list: list, first_call: bool = True) -> lis elif isinstance(in_list[i], str): in_list[i] = atom_order[in_list[i]] else: - raise ValueError("Unexpected type when mapping nested lists!") + raise TypeError("Unexpected type when mapping nested lists!") return in_list diff --git a/src/transformers/models/esm/openfold_utils/rigid_utils.py b/src/transformers/models/esm/openfold_utils/rigid_utils.py index 2bc2fe5f5c4ebf..08f5ce0a4f7e2c 100644 --- a/src/transformers/models/esm/openfold_utils/rigid_utils.py +++ b/src/transformers/models/esm/openfold_utils/rigid_utils.py @@ -343,7 +343,7 @@ def __getitem__(self, index: Any) -> Rotation: Returns: The indexed rotation """ - if type(index) != tuple: + if type(index) is not tuple: index = (index,) if self._rot_mats is not None: @@ 
-827,7 +827,7 @@ def __getitem__(self, index: Any) -> Rigid: Returns: The indexed tensor """ - if type(index) != tuple: + if type(index) is not tuple: index = (index,) return Rigid( diff --git a/src/transformers/models/esm/openfold_utils/tensor_utils.py b/src/transformers/models/esm/openfold_utils/tensor_utils.py index 20ee34b236f177..efe72e4905b81f 100644 --- a/src/transformers/models/esm/openfold_utils/tensor_utils.py +++ b/src/transformers/models/esm/openfold_utils/tensor_utils.py @@ -134,7 +134,7 @@ def tree_map(fn, tree, leaf_type): return fn(tree) else: print(type(tree)) - raise ValueError("Not supported") + raise TypeError("Not supported") tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index d1050d542a2f38..fc7a38ed134d4f 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -283,7 +283,6 @@ def __init__(self, config: FalconConfig): self.attention_dropout = nn.Dropout(config.attention_dropout) self.num_kv_heads = config.num_kv_heads if (self.new_decoder_architecture or not self.multi_query) else 1 - # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Falcon def _init_rope(self): if self.config.rope_scaling is None: self.rotary_emb = FalconRotaryEmbedding( @@ -603,6 +602,7 @@ def forward( value_layer, attention_mask, query_length, + position_ids=position_ids, dropout=attn_dropout, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index dbc4e51703847a..314925789ce1f4 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -1181,19 +1181,19 @@ def __init__(self, config: FlavaConfig): super().__init__(config) if not isinstance(config.text_config, FlavaTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type FlavaTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.image_config, FlavaImageConfig): - raise ValueError( + raise TypeError( "config.image_config is expected to be of type FlavaImageConfig but is of type" f" {type(config.image_config)}." ) if not isinstance(config.multimodal_config, FlavaMultimodalConfig): - raise ValueError( + raise TypeError( "config.multimodal_config is expected to be of type FlavaMultimodalConfig but " + f"is of type {type(config.multimodal_config)}." ) diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py index 6a710d660c4e41..68e7d958b74892 100644 --- a/src/transformers/models/funnel/tokenization_funnel.py +++ b/src/transformers/models/funnel/tokenization_funnel.py @@ -315,7 +315,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). 
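Two cleanups recur throughout this diff: a wrong argument type now raises `TypeError` instead of `ValueError`, and exact type checks compare with `is` rather than `==`. A small, self-contained illustration of both idioms (the function names are made up):

```python
def as_index_tuple(index):
    # Exact type check via identity, as in the rigid_utils change above.
    if type(index) is not tuple:
        index = (index,)
    return index

def check_hidden_states(hidden_states):
    # A wrong *type* is a TypeError, not a ValueError.
    if not isinstance(hidden_states, (tuple, list)):
        raise TypeError("hidden_states should be a tuple or list of tensors")
    return hidden_states

print(as_index_tuple(3))         # (3,)
check_hidden_states([1.0, 2.0])  # passes silently
```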
@@ -477,7 +477,7 @@ def _clean_text(self, text): # Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer -class WordpieceTokenizer(object): +class WordpieceTokenizer: """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index ffcdd2b61750a6..03d2aecc02b6c9 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -188,7 +188,6 @@ def __init__( **kwargs, ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. diff --git a/src/transformers/models/gemma/diff_gemma.py b/src/transformers/models/gemma/diff_gemma.py index d2a653120965da..fcdb0f0b3d7d8f 100644 --- a/src/transformers/models/gemma/diff_gemma.py +++ b/src/transformers/models/gemma/diff_gemma.py @@ -180,6 +180,9 @@ def forward(self, x): output = output * (1.0 + self.weight.float()) return output.type_as(x) + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.eps}" + ALL_LAYERNORM_LAYERS.append(GemmaRMSNorm) @@ -474,7 +477,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False # noqa: F841 - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True # noqa: F841 past_key_values = DynamicCache.from_legacy_cache(past_key_values) diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 80e97fe700b5ca..bfedbea28866e3 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -68,6 +68,9 @@ def forward(self, x): output = output * (1.0 + self.weight.float()) return output.type_as(x) + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.eps}" + ALL_LAYERNORM_LAYERS.append(GemmaRMSNorm) @@ -393,6 +396,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=getattr(self, "sliding_window", None), is_causal=self.is_causal, @@ -769,7 +773,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False # noqa: F841 - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True # noqa: F841 past_key_values = DynamicCache.from_legacy_cache(past_key_values) @@ -794,7 +800,9 @@ def forward( # See https://github.com/huggingface/transformers/pull/29402 normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) hidden_states = hidden_states * normalizer - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git 
a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 10d00fa460ba69..db564aa713b43a 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -70,6 +70,9 @@ def forward(self, x): output = output * (1.0 + self.weight.float()) return output.type_as(x) + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.eps}" + class Gemma2RotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 2b9d300aa9e1bc..7a51cb3eb2cdb8 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -1030,18 +1030,18 @@ def forward( # Attention mask. _use_sdpa = self._attn_implementation == "sdpa" and output_attentions is False and head_mask is None - if attention_mask is not None: - attention_mask = attention_mask.view(batch_size, -1) - if self._attn_implementation == "flash_attention_2": - attention_mask = attention_mask if 0 in attention_mask else None - elif _use_sdpa: - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask=attention_mask, - input_shape=(batch_size, input_shape[-1]), - inputs_embeds=inputs_embeds, - past_key_values_length=past_length, - ) - else: + attention_mask = attention_mask.view(batch_size, -1) if attention_mask is not None else None + if self._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif _use_sdpa: + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask=attention_mask, + input_shape=(batch_size, input_shape[-1]), + inputs_embeds=inputs_embeds, + past_key_values_length=past_length, + ) + else: + if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py index 9bca559d9ea009..badacf6dbe71ff 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2.py +++ b/src/transformers/models/gpt2/tokenization_gpt2.py @@ -329,10 +329,3 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): if is_split_into_words or add_prefix_space: text = " " + text return (text, kwargs) - - @property - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. 
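The GPT-2 tokenizers lose their default EOS-concatenation template below. A hedged migration sketch: a workflow that still depends on that behaviour can pin the same template string (copied from this diff) explicitly.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"

print(tokenizer.apply_chat_template([{"role": "user", "content": "Hi"}], tokenize=False))
# Hi<|endoftext|>
```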
- """ - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py index e6747119f4227f..90e83f0d35a351 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py +++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py @@ -139,12 +139,3 @@ def _encode_plus(self, *args, **kwargs) -> BatchEncoding: def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) - - @property - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. - """ - - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 8e4c94692e0537..944dbb5e02f098 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -154,7 +154,6 @@ def __init__( "The hidden size is not divisble by the number of attention heads! Make sure to update them!" ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index fd5e0b4fe62e25..32988e88df34a8 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -824,25 +824,23 @@ def forward( inputs_embeds = self.embed_in(input_ids) # Attention mask. 
- if attention_mask is not None: - assert batch_size > 0, "batch_size has to be defined and > 0" - attention_mask = attention_mask.view(batch_size, -1) - if self._attn_implementation == "flash_attention_2": - attention_mask = attention_mask if 0 in attention_mask else None - elif self._attn_implementation == "sdpa" and not output_attentions and head_mask is None: - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask=attention_mask, - input_shape=(batch_size, seq_length), - inputs_embeds=inputs_embeds, - past_key_values_length=past_length, - ) - else: - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask=attention_mask, - input_shape=(batch_size, seq_length), - inputs_embeds=inputs_embeds, - past_key_values_length=past_length, - ) + attention_mask = attention_mask.view(batch_size, -1) if attention_mask is not None else None + if self._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions and head_mask is None: + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask=attention_mask, + input_shape=(batch_size, seq_length), + inputs_embeds=inputs_embeds, + past_key_values_length=past_length, + ) + else: + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask=attention_mask, + input_shape=(batch_size, seq_length), + inputs_embeds=inputs_embeds, + past_key_values_length=past_length, + ) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py index 2504fa3cc05154..c79e6d9ada15d3 100644 --- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py +++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py @@ -228,11 +228,3 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) - - @property - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. - """ - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py index f36f7e3fd6104d..285dcb7d18e2b8 100644 --- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py @@ -161,18 +161,6 @@ def convert_tokens_to_string(self, tokens): out_string = "".join(tokens).strip() return out_string - @property - def default_chat_template(self): - """ - A simple chat template that just adds BOS/EOS tokens around messages while discarding role information. 
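The GPT-2 and GPT-NeoX hunks above restructure attention-mask preparation so the 2D mask is reshaped once and `None` flows through every branch instead of being special-cased. A standalone sketch of that branching, assuming the private helpers in `transformers.modeling_attn_mask_utils` (internal API, not a stable contract) and omitting the extra `output_attentions`/`head_mask` checks:

```python
from transformers.modeling_attn_mask_utils import (
    _prepare_4d_causal_attention_mask,
    _prepare_4d_causal_attention_mask_for_sdpa,
)

def prepare_attention_mask(attention_mask, attn_implementation, batch_size, seq_length, inputs_embeds, past_length):
    # Reshape up front; a missing mask stays None in all branches.
    attention_mask = attention_mask.view(batch_size, -1) if attention_mask is not None else None
    if attn_implementation == "flash_attention_2":
        # Flash attention only needs a mask when there is real padding (a 0 somewhere).
        return attention_mask if (attention_mask is not None and 0 in attention_mask) else None
    if attn_implementation == "sdpa":
        return _prepare_4d_causal_attention_mask_for_sdpa(
            attention_mask=attention_mask,
            input_shape=(batch_size, seq_length),
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_length,
        )
    return _prepare_4d_causal_attention_mask(
        attention_mask=attention_mask,
        input_shape=(batch_size, seq_length),
        inputs_embeds=inputs_embeds,
        past_key_values_length=past_length,
    )
```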
- """ - return ( - "{% for message in messages %}" - "{{ bos_token + eos_token + message.content + eos_token }}" - "{% endfor %}" - "{% if add_generation_prompt %} {{ bos_token + eos_token }} {% endif %}" - ) - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: index = 0 if os.path.isdir(save_directory): @@ -204,7 +192,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return vocab_file, emoji_file -class SubWordJapaneseTokenizer(object): +class SubWordJapaneseTokenizer: """ https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT Lisence according to the original repository. diff --git a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py index 1000bfd1b6c8b1..262aeaba5eea10 100644 --- a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py +++ b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py @@ -294,19 +294,3 @@ def decode_fast(self, token_ids: Union[int, List[int]]) -> str: """ return self.sp_model.decode(token_ids) - - @property - def default_chat_template(self): - """ - This chat template formats messages like an instant messenger chat log, with "User:" and "Bot:" strings - preceding messages. BOS tokens are added between all messages. - """ - return ( - "{{ eos_token }}{{ bos_token }}" - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}" - "{% else %}{{ 'Bot: ' + message['content']}}{% endif %}" - "{{ message['text'] }}{{ bos_token }}" - "{% endfor %}" - "Bot:" - ) diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py index dcdccc50cc116d..6527a36bd2e3be 100644 --- a/src/transformers/models/grounding_dino/modeling_grounding_dino.py +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -2617,7 +2617,7 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f # Copied from transformers.models.detr.modeling_detr.NestedTensor -class NestedTensor(object): +class NestedTensor: def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors self.mask = mask diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index 32e1d777cb7dad..2a0d4f3c0e4e2b 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -1302,13 +1302,13 @@ def __init__(self, config: GroupViTConfig): super().__init__(config) if not isinstance(config.text_config, GroupViTTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type GroupViTTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, GroupViTVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type GroupViTVisionConfig but is of type" f" {type(config.vision_config)}." 
) diff --git a/src/transformers/models/groupvit/modeling_tf_groupvit.py b/src/transformers/models/groupvit/modeling_tf_groupvit.py index f06c5f57f83fb3..b5838a5264f69d 100644 --- a/src/transformers/models/groupvit/modeling_tf_groupvit.py +++ b/src/transformers/models/groupvit/modeling_tf_groupvit.py @@ -1443,13 +1443,13 @@ def __init__(self, config: GroupViTConfig, **kwargs): super().__init__(**kwargs) if not isinstance(config.text_config, GroupViTTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type GroupViTTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, GroupViTVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type GroupViTVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/herbert/tokenization_herbert.py b/src/transformers/models/herbert/tokenization_herbert.py index 6e37922028e7be..bb078d4dde6db6 100644 --- a/src/transformers/models/herbert/tokenization_herbert.py +++ b/src/transformers/models/herbert/tokenization_herbert.py @@ -113,7 +113,7 @@ def whitespace_tokenize(text): # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index fd0c271b668109..da79c2894877b4 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -1325,7 +1325,7 @@ def forward( ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index 6c2a341927e200..2adfeea5b8b883 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -1471,7 +1471,7 @@ def call( ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 @@ -1583,7 +1583,7 @@ def call( ... 
return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 6d658259860973..63eac6cf528ad9 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -431,6 +431,9 @@ def forward(self, hidden_states): return self.weight * hidden_states + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + ALL_LAYERNORM_LAYERS.append(IdeficsRMSNorm) diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 4d978c053d3fa6..7c80ddfe920f8a 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -301,7 +301,7 @@ def forward( # Flash attention requires the input to have the shape # batch_size x seq_length x head_dim x hidden_dim # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) @@ -311,7 +311,6 @@ def forward( # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache # to be able to avoid many of these transpose/reshape/view. 
- query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) @@ -677,6 +676,9 @@ def forward(self, hidden_states): hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + class Idefics2PerceiverAttention(nn.Module): def __init__(self, config, layer_idx: Optional[int] = None) -> None: @@ -817,7 +819,7 @@ def forward( key_states = self.k_proj(torch.cat([context, latents], dim=-2)) value_states = self.v_proj(torch.cat([context, latents], dim=-2)) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) key_states = key_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -882,7 +884,6 @@ def forward( value_states = value_states.to(target_dtype) # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index c665ba74d06aef..2e14118144baaa 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -251,60 +251,3 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - @property - def default_chat_template(self): - """ - This template formats inputs in the form of a chat history. For each message in the chat history: - * the template will output the role of the speaker followed by the content of the message. - * content can be a single string or a list of strings and images. - * If the content element is an image, the template will output a sequence of tokens and token before and after each image - * The template will output an token at the end of each message. - - Example: - - ```python - messages = [{ - "role": "user", - "content": [ - {"type": "text", "text": "What’s in this image?"}, - {"type": "image"}, - {"type": "image"}, - ], - }, - { - "role": "assistant", - "content": [{"type": "text", "text": "This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground."},] - }] - ``` - - Will create outputs like: - ``` - User: What is in this Image? - Assistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground. 
- ``` - """ - # fmt: off - return ( - "{% for message in messages %}" - "{{message['role'].capitalize()}}" - "{% if message['content'][0]['type'] == 'image' %}" - "{{':'}}" - "{% else %}" - "{{': '}}" - "{% endif %}" - "{% for line in message['content'] %}" - "{% if line['type'] == 'text' %}" - "{{line['text']}}" - "{% elif line['type'] == 'image' %}" - "{{ '' }}" - "{% endif %}" - "{% endfor %}" - "\n" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "{{ 'Assistant:' }}" - "{% endif %}" - ) - # fmt: on diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 768e8e01607588..cb7ef14179ebbc 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -178,6 +178,9 @@ def forward(self, hidden_states): hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + # Copied from transformers.models.llama.modeling_llama.repeat_kv def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: @@ -406,7 +409,7 @@ def forward( # Flash attention requires the input to have the shape # batch_size x seq_length x head_dim x hidden_dim # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -469,7 +472,6 @@ def forward( value_states = value_states.to(target_dtype) # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) @@ -1625,7 +1627,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(JAMBA_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index 16d8335e0a52e7..eaed498d4cb558 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -374,6 +374,9 @@ def forward(self, hidden_states): hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + # Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with Gemma->JetMoe class JetMoeRotaryEmbedding(nn.Module): @@ -978,7 +981,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) @@ 
-1363,7 +1368,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(JETMOE_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py index fa6a5f29e93ae7..b0a57dac5fdadc 100644 --- a/src/transformers/models/layoutlm/tokenization_layoutlm.py +++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py @@ -285,7 +285,7 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). @@ -447,7 +447,7 @@ def _clean_text(self, text): # Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer -class WordpieceTokenizer(object): +class WordpieceTokenizer: """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py index c9a138391e0f25..fe0305562374d7 100644 --- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py @@ -1323,7 +1323,7 @@ def _pad( # Copied from transformers.models.bert.tokenization_bert.BasicTokenizer -class BasicTokenizer(object): +class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). @@ -1485,7 +1485,7 @@ def _clean_text(self, text): # Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer -class WordpieceTokenizer(object): +class WordpieceTokenizer: """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token, max_input_chars_per_word=100): diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 1a059101e42492..710809093f3849 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -20,10 +20,7 @@ """LLaMA model configuration""" from ...configuration_utils import PretrainedConfig -from ...utils import logging - - -logger = logging.get_logger(__name__) +from ...modeling_rope_utils import rope_config_validation class LlamaConfig(PretrainedConfig): @@ -76,21 +73,50 @@ class LlamaConfig(PretrainedConfig): End of stream token id. pretraining_tp (`int`, *optional*, defaults to 1): Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to understand more about it. This value is - necessary to ensure exact reproducibility of the pretraining results. Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. 
Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to the value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (> + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention.
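For readers of this diff, a minimal sketch of how the new `rope_scaling` dictionary is consumed may help. The toy model sizes and the scaling values below are arbitrary placeholders, not taken from this PR:

```python
from transformers import LlamaConfig, LlamaForCausalLM

# Hypothetical toy config: only the rope_scaling dict is the point of this sketch.
# "rope_type" replaces the legacy "type" key; a legacy "type" key is remapped to
# "rope_type" before rope_config_validation() runs inside LlamaConfig.__init__.
config = LlamaConfig(
    vocab_size=1000,
    hidden_size=256,
    intermediate_size=512,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=4,
    max_position_embeddings=8192,
    rope_scaling={"rope_type": "linear", "factor": 2.0},
)
model = LlamaForCausalLM(config)  # validation has already run by this point
```

Passing the pre-PR format `{"type": "linear", "factor": 2.0}` still works, since the backwards-compatibility remapping added to `LlamaConfig.__init__` in this diff copies `type` into `rope_type` before validation.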
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -158,11 +184,16 @@ def __init__( self.use_cache = use_cache self.rope_theta = rope_theta self.rope_scaling = rope_scaling - self._rope_scaling_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, move it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -170,23 +201,3 @@ def __init__( tie_word_embeddings=tie_word_embeddings, **kwargs, ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py index fd6ab4f2e926de..384daab6b6d7a5 100644 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py @@ -17,10 +17,11 @@ import os import shutil import warnings +from typing import List import torch -from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast +from transformers import GenerationConfig, LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast from transformers.convert_slow_tokenizer import TikTokenConverter @@ -85,8 +86,12 @@ "65B": 8, "70B": 8, "70Bf": 8, + "405B": 8, + "405B-MP16": 16, } +CONTEXT_LENGTH_FOR_VERSION = {"3.1": 131072, "3": 8192, "2": 4096, "1": 2048} + def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) @@ -107,9 +112,10 @@ def write_model( input_base_path, model_size=None, safe_serialization=True, - llama_version=1, + llama_version="1", vocab_size=None, num_shards=None, + instruct=False, ): os.makedirs(model_path, exist_ok=True) tmp_model_path = os.path.join(model_path, "tmp") @@ -125,18 +131,11 @@ def write_model( dims_per_head = dim // n_heads base = params.get("rope_theta", 10000.0) inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - if base > 10000.0 and llama_version != 3: + if base > 10000.0 and float(llama_version) < 3: max_position_embeddings = 16384 else: - # Depending on the Llama version, the default max_position_embeddings has different values. 
- if llama_version == 1: - max_position_embeddings = 2048 - elif llama_version == 2: - max_position_embeddings = 4096 - elif llama_version == 3: - max_position_embeddings = 8192 - - vocab_size = vocab_size if vocab_size is not None else 32000 + max_position_embeddings = CONTEXT_LENGTH_FOR_VERSION[llama_version] + if params.get("n_kv_heads", None) is not None: num_key_value_heads = params["n_kv_heads"] # for GQA / MQA num_key_value_heads_per_shard = num_key_value_heads // num_shards @@ -144,8 +143,7 @@ def write_model( else: # compatibility with other checkpoints num_key_value_heads = n_heads num_key_value_heads_per_shard = n_heads_per_shard - key_value_dim = dims_per_head * num_key_value_heads - print(num_shards, num_key_value_heads, num_key_value_heads_per_shard, key_value_dim) + key_value_dim = dim # permute for sliced rotary def permute(w, n_heads, dim1=dim, dim2=dim): @@ -159,11 +157,9 @@ def permute(w, n_heads, dim1=dim, dim2=dim): loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") else: # Sharded - loaded = [ - torch.load(os.path.join(input_base_path, file), map_location="cpu") - for file in sorted(os.listdir(input_base_path)) - if file.endswith(".pth") - ] + checkpoint_list = sorted([file for file in os.listdir(input_base_path) if file.endswith(".pth")]) + print("Loading in order:", checkpoint_list) + loaded = [torch.load(os.path.join(input_base_path, file), map_location="cpu") for file in checkpoint_list] param_count = 0 index_dict = {"weight_map": {}} for layer_i in range(n_layers): @@ -263,7 +259,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): "lm_head.weight": loaded["output.weight"], } else: - concat_dim = 0 if llama_version == 3 else 1 + concat_dim = 0 if llama_version in ["3", "3.1"] else 1 state_dict = { "model.norm.weight": loaded[0]["norm.weight"], "model.embed_tokens.weight": torch.cat( @@ -282,6 +278,18 @@ def permute(w, n_heads, dim1=dim, dim2=dim): write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 multiple_of = params["multiple_of"] if "multiple_of" in params else 256 + + if llama_version in ["3", "3.1"]: + bos_token_id = 128000 + + if instruct: + eos_token_id = [128001, 128008, 128009] + else: + eos_token_id = 128001 + else: + bos_token_id = 1 + eos_token_id = 2 + config = LlamaConfig( hidden_size=dim, intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), @@ -292,11 +300,21 @@ def permute(w, n_heads, dim1=dim, dim2=dim): vocab_size=vocab_size, rope_theta=base, max_position_embeddings=max_position_embeddings, - bos_token_id=128000 if llama_version == 3 else 1, - eos_token_id=128001 if llama_version == 3 else 2, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, ) config.save_pretrained(tmp_model_path) + if instruct: + generation_config = GenerationConfig( + do_sample=True, + temperature=0.6, + top_p=0.9, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + ) + generation_config.save_pretrained(tmp_model_path) + # Make space so we can load the model properly now. 
del state_dict del loaded @@ -313,7 +331,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): class Llama3Converter(TikTokenConverter): - def __init__(self, vocab_file, num_reserved_special_tokens=256, **kwargs): + def __init__(self, vocab_file, special_tokens=None, instruct=False, model_max_length=None, **kwargs): super().__init__(vocab_file, **kwargs) tokenizer = self.converted() chat_template = ( @@ -327,34 +345,24 @@ def __init__(self, vocab_file, num_reserved_special_tokens=256, **kwargs): "{% endfor %}" "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}" ) - num_reserved_special_tokens = 256 - special_tokens = [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>", # end of turn - ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)] tokenizer.add_special_tokens(special_tokens) self.tokenizer = PreTrainedTokenizerFast( tokenizer_object=tokenizer, bos_token="<|begin_of_text|>", - eos_token="<|end_of_text|>", - chat_template=chat_template, + eos_token="<|end_of_text|>" if not instruct else "<|eot_id|>", + chat_template=chat_template if instruct else None, model_input_names=["input_ids", "attention_mask"], + model_max_length=model_max_length, ) -def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version=2): +def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version="2", special_tokens=None, instruct=False): tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast - if llama_version == 3: - tokenizer = Llama3Converter(input_tokenizer_path).tokenizer + if llama_version in ["3", "3.1"]: + tokenizer = Llama3Converter( + input_tokenizer_path, special_tokens, instruct, model_max_length=CONTEXT_LENGTH_FOR_VERSION[llama_version] + ).tokenizer else: tokenizer = tokenizer_class(input_tokenizer_path) print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") @@ -362,6 +370,37 @@ def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version=2): return tokenizer +DEFAULT_LLAMA_SPECIAL_TOKENS = { + "3": [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|reserved_special_token_2|>", + "<|reserved_special_token_3|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|reserved_special_token_4|>", + "<|eot_id|>", # end of turn + ] + + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)], + "3.1": [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|finetune_right_pad_id|>", + "<|reserved_special_token_2|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|eom_id|>", # end of message + "<|eot_id|>", # end of turn + "<|python_tag|>", + ] + + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], +} + + def main(): parser = argparse.ArgumentParser() parser.add_argument( @@ -383,9 +422,9 @@ def main(): # Different Llama versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. parser.add_argument( "--llama_version", - choices=[1, 2, 3], - default=1, - type=int, + choices=["1", "2", "3", "3.1"], + default="1", + type=str, help="Version of the Llama model to convert. Currently supports Llama1 and Llama2. 
Controls the context size", ) parser.add_argument( @@ -394,11 +433,34 @@ def main(): type=int, help="The number of individual shards used for the model. Does not have to be the same as the number of consolidated_xx.pth", ) + parser.add_argument( + "--special_tokens", + default=None, + type=List[str], + help="The list of special tokens that should be added to the model.", + ) + parser.add_argument( + "--instruct", + default=False, + type=bool, + help="Whether the model is an instruct model or not. Will affect special tokens for llama 3.1.", + ) args = parser.parse_args() if args.model_size is None and args.num_shards is None: raise ValueError("You have to set at least `num_shards` if you are not giving the `model_size`") + if args.special_tokens is None: + args.special_tokens = DEFAULT_LLAMA_SPECIAL_TOKENS[str(args.llama_version)] + spm_path = os.path.join(args.input_dir, "tokenizer.model") - vocab_size = len(write_tokenizer(args.output_dir, spm_path, llama_version=args.llama_version)) + vocab_size = len( + write_tokenizer( + args.output_dir, + spm_path, + llama_version=args.llama_version, + special_tokens=args.special_tokens, + instruct=args.instruct, + ) + ) if args.model_size != "tokenizer_only": write_model( model_path=args.output_dir, @@ -408,6 +470,7 @@ def main(): llama_version=args.llama_version, vocab_size=vocab_size, num_shards=args.num_shards, + instruct=args.instruct, ) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 5c0c57f3effe86..0db0ed60fca507 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -37,6 +37,7 @@ SequenceClassifierOutputWithPast, TokenClassifierOutput, ) +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( @@ -70,29 +71,85 @@ def forward(self, hidden_states): hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + ALL_LAYERNORM_LAYERS.append(LlamaRMSNorm) class LlamaRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[LlamaConfig] = None, + ): super().__init__() - self.scaling_factor = scaling_factor - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. 
All other arguments will be removed in v4.45" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) self.register_buffer("inv_freq", inv_freq, persistent=False) - # For BC we register cos and sin cached - self.max_seq_len_cached = max_position_embeddings + self.original_inv_freq = self.inv_freq + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len @torch.no_grad() def forward(self, x, position_ids): - # x: [bs, num_attention_heads, seq_len, head_size] + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) device_type = x.device.type device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): @@ -100,36 +157,37 @@ def forward(self, x, position_ids): emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() sin = emb.sin() + + # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): """LlamaRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" - def forward(self, x, position_ids): - # difference to the original RoPE: a scaling factor is aplied to the position ids - position_ids = position_ids.float() / self.scaling_factor - cos, sin = super().forward(x, position_ids) - return cos, sin + def __init__(self, *args, **kwargs): + logger.warning_once( + "`LlamaLinearScalingRotaryEmbedding` is deprecated and will be removed in v4.45. Please use " + "`LlamaRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)." + ) + kwargs["rope_type"] = "linear" + super().__init__(*args, **kwargs) class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" - def forward(self, x, position_ids): - # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length - seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / ( - base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(x.device) / self.dim) - ) - self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: this may break with compilation - - cos, sin = super().forward(x, position_ids) - return cos, sin + def __init__(self, *args, **kwargs): + logger.warning_once( + "`LlamaDynamicNTKScalingRotaryEmbedding` is deprecated and will be removed in v4.45. Please use " + "`LlamaRotaryEmbedding`, which now also does dynamic NTK scaling (simply pass the model config to " + "__init__)." + ) + kwargs["rope_type"] = "dynamic" + super().__init__(*args, **kwargs) def rotate_half(x): @@ -246,34 +304,9 @@ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + # TODO (joao): remove in v4.45 (RoPE is computed in the model, not in the decoder layers) + self.rotary_emb = LlamaRotaryEmbedding(config=self.config) def forward( self, @@ -284,6 +317,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) ->
Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -314,7 +348,16 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -383,6 +426,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if isinstance(past_key_value, StaticCache): raise ValueError( @@ -405,7 +449,16 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." 
+ ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -453,6 +506,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=getattr(self, "sliding_window", None), use_top_left_mask=self._flash_attn_uses_top_left_mask, @@ -485,6 +539,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions: @@ -501,6 +556,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, ) bsz, q_len, _ = hidden_states.size() @@ -513,7 +569,16 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -583,6 +648,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -600,6 +666,9 @@ def forward( past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. 
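As a rough illustration of the `position_embeddings` hand-off described in this docstring, the sketch below builds a deliberately tiny, untrained `LlamaModel` (all sizes are arbitrary) and shows the `(cos, sin)` tuple that the model-level rotary embedding added by this PR produces for the decoder layers:

```python
import torch
from transformers import LlamaConfig, LlamaModel

# Arbitrary toy dimensions so the example runs quickly; the weights are random.
config = LlamaConfig(
    vocab_size=1000, hidden_size=64, intermediate_size=128,
    num_hidden_layers=2, num_attention_heads=4, num_key_value_heads=4,
)
model = LlamaModel(config)

input_ids = torch.randint(0, config.vocab_size, (1, 6))
inputs_embeds = model.embed_tokens(input_ids)
position_ids = torch.arange(6).unsqueeze(0)

# The rotary embedding is now computed once at the model level and shared with every
# decoder layer through the new `position_embeddings` argument (a (cos, sin) tuple).
cos, sin = model.rotary_emb(inputs_embeds, position_ids)
print(cos.shape, sin.shape)  # both (1, 6, 16) here, i.e. (batch_size, seq_len, head_dim)
```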
kwargs (`dict`, *optional*): Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code into the model @@ -617,6 +686,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = residual + hidden_states @@ -779,6 +849,7 @@ def __init__(self, config: LlamaConfig): [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = LlamaRotaryEmbedding(config=config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -826,7 +897,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( @@ -845,10 +918,11 @@ def forward( causal_mask = self._update_causal_mask( attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions ) - - # embed positions hidden_states = inputs_embeds + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None @@ -868,6 +942,7 @@ def forward( output_attentions, use_cache, cache_position, + position_embeddings, ) else: layer_outputs = decoder_layer( @@ -878,6 +953,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, ) hidden_states = layer_outputs[0] @@ -1192,7 +1268,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index 80865ba98d6d67..cc03c1470ee24f 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -261,9 +261,8 @@ def _tokenize(self, text, **kwargs): `unk_token`. Here is an example with `unk_token = ""` and `unk_token_length = 4`. `self.tokenizer.sp_model.encode(" Hey", out_type = str)[4:]`. """ - tokens = self.sp_model.encode(text, out_type=str) if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")): - return tokens + return self.sp_model.encode(text, out_type=str) # 1. Encode string + prefix ex: " Hey" tokens = self.sp_model.encode(self.unk_token + text, out_type=str) @@ -411,57 +410,3 @@ def create_token_type_ids_from_sequences( output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) return output - - @property - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. 
- Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! - - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. - """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py index 91d3bf3615171f..67e339b4290a2b 100644 --- a/src/transformers/models/llama/tokenization_llama_fast.py +++ b/src/transformers/models/llama/tokenization_llama_fast.py @@ -241,61 +241,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (out_vocab_file,) - @property - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. - Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. 
The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! - - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. - """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template - # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index e51e6ba076532a..a563b1cb82e788 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -104,14 +104,14 @@ def __call__( - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
""" if images is not None: - pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"] + image_inputs = self.image_processor(images, return_tensors=return_tensors) else: - pixel_values = None + image_inputs = {} text_inputs = self.tokenizer( text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length ) - return BatchFeature(data={**text_inputs, "pixel_values": pixel_values}) + return BatchFeature(data={**text_inputs, **image_inputs}) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py b/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py index 2c8aefe39dc255..06edc5c9b1adbc 100644 --- a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py +++ b/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py @@ -24,6 +24,7 @@ """ import argparse +import gc import glob import json from pathlib import Path @@ -111,6 +112,16 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): elif model_id == "liuhaotian/llava-v1.6-34b": text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B" image_token_index = 64000 + elif model_id == "lmms-lab/llama3-llava-next-8b": + text_model_id = "meta-llama/Meta-Llama-3-8B-Instruct" + image_token_index = 128256 + elif model_id == "lmms-lab/llava-next-72b": + text_model_id = "Qwen/Qwen1.5-72B-Chat" + image_token_index = 151646 + elif model_id == "lmms-lab/llava-next-110b": + text_model_id = "Qwen/Qwen1.5-110B-Chat" + image_token_index = 151646 + vision_model_id = data["mm_vision_tower"] torch.set_default_dtype(torch.float16) @@ -120,7 +131,7 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=use_fast) tokenizer.add_tokens(AddedToken("", special=True, normalized=False), special_tokens=True) - if model_id == "liuhaotian/llava-v1.6-mistral-7b": + if model_id in ("liuhaotian/llava-v1.6-mistral-7b", "lmms-lab/llama3-llava-next-8b"): # Mistral-7B doesn't have a padding token set yet tokenizer.add_special_tokens({"pad_token": ""}) @@ -151,28 +162,45 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): # We add an image token so we resize the model # Pad to 64 for performance reasons - pad_shape = 64 - vocab_size = config.text_config.vocab_size - if model_id == "liuhaotian/llava-v1.6-34b": - # this one has 3 additional tokens, namely <|startoftext|>, <|endoftext|> and - num_tokens = vocab_size + 3 - else: - # this one has 2 additional tokens, namely and - num_tokens = vocab_size + 2 - model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple( - (dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0])) - ), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), - dim=0, - ) + # Qwen-based models have extra unused space in the vocab size already, so no need to resize + if model_id not in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: + pad_shape = 64 + vocab_size = config.text_config.vocab_size + if model_id == "liuhaotian/llava-v1.6-34b": + # this one 
has 3 additional tokens, namely <|startoftext|>, <|endoftext|> and + num_tokens = vocab_size + 3 + else: + # this one has 2 additional tokens, namely and + num_tokens = vocab_size + 2 + model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape) + model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( + tuple( + ( + dist.sample() + for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0]) + ) + ), + dim=0, + ) + model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( + tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), + dim=0, + ) + + print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + # Make space so we can load the model properly now. + del state_dict + gc.collect() - device = "cuda:2" - model.to(device) + # Load everything back for inference tests in float32 because prev script was written as that + # Though it's mostly loaded in fp16 as original weights are in fp16 + model = LlavaNextForConditionalGeneration.from_pretrained(pytorch_dump_folder_path, device_map="auto") + processor = LlavaNextProcessor.from_pretrained(pytorch_dump_folder_path) + device = model.device # prepare inputs image = load_image() @@ -182,6 +210,11 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT:" elif model_id == "liuhaotian/llava-v1.6-34b": prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" + elif model_id == "lmms-lab/llama3-llava-next-8b": + prompt = "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + elif model_id in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: + prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n" + inputs = processor(images=image, text=prompt, return_tensors="pt") # verify inputs @@ -194,8 +227,6 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): original_input_ids = torch.load(filepath, map_location="cpu") # replace -200 by image_token_index (since we use token ID = 32000 for the image token) original_input_ids[original_input_ids == -200] = image_token_index - print(tokenizer.decode([id for id in original_input_ids.tolist()[0] if id != -200])) - assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist() elif model_id == "liuhaotian/llava-v1.6-34b": @@ -243,6 +274,26 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): dtype=torch.float32, device=device, ) + elif model_id == "lmms-lab/llama3-llava-next-8b": + expected_slice = torch.tensor( + [[-3.9648, 1.1396, 3.3145], [-5.3594, -1.5654, -1.9619], [-12.3750, -10.6797, -9.3125]], + dtype=torch.float32, + device=device, + ) + elif model_id == "lmms-lab/llava-next-72b": + # Not yet checked against reference + expected_slice = torch.tensor( + [[3.7148, 3.9277, 3.4395], [-0.4341, 1.1387, 6.5117], [3.2324, 3.4688, 4.1133]], + dtype=torch.float32, + device=device, + ) + elif model_id == "lmms-lab/llava-next-110b": + # Not yet checked against reference + expected_slice = torch.tensor( + [[-2.5449, -1.6738, -2.0371], [1.0811, 3.4961, 5.0312], [1.7803, 2.5137, 2.4277]], + dtype=torch.float32, + device=device, + ) else: raise ValueError(f"Model {model_id} not supported") @@ -268,6 +319,12 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): expected_text = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a radar chart, also known as a spider chart or star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several variables represented:\n\n- MM-Vet\n- LLa-Va-Bench\n- SEED-Bench\n- MM" elif model_id == "liuhaotian/llava-v1.6-34b": expected_text = "<|im_start|> system\nAnswer the questions. <|im_start|> user\n\nWhat is shown in this image? <|im_start|> assistant\nThe image appears to be a radar chart, also known as a spider chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular chart, there are several datasets represented by different colors and labeled with various acronyms such as MM-Vet, LLaVA-Bench, SEED-Bench, MM-Bench-CN, MM-" + elif model_id == "lmms-lab/llama3-llava-next-8b": + expected_text = 'system\n\nYou are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.user\n\n\nWhat is shown in this image?assistant\n\n\nThe image shows a radar chart, also known as a spider chart or a web chart, which is a type of graph used to display multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the values are plotted along each axis and connected to form a polygon.\n\nIn this particular radar chart, there are several axes labeled with different variables, such as "MM-Vet," "LL' + elif model_id == "lmms-lab/llava-next-72b": + expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image displays a radar chart, also known as a spider chart or a star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the value of each variable is represented by the distance from the center of the chart to the point where the axis intersects with the line representing that variable's value.\n\nIn this particular chart, there are several axes" + elif model_id == "lmms-lab/llava-next-110b": + expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart comparing the performance of different models on various visual question answering (VQA) benchmarks. Each colored line represents a different model, and the distance from the center of the chart indicates the score or performance level of the model on a particular benchmark. The benchmarks are labeled around the edges of the chart, and include VQA v2, GQA, VizWiz, TextVQA, MMBench-CN, MME, and others. The chart allows for a" else: raise ValueError(f"Model {model_id} not supported") @@ -281,7 +338,7 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): inputs = processor( images=[image, cats_image], - text=[prompt, "[INST] \nHow many cats are there? 
[/INST]"], + text=[prompt, prompt], padding=True, return_tensors="pt", ).to(device) @@ -305,16 +362,11 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) print(outputs) - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - if push_to_hub: - repo_id = model_id.split("/")[-1] - model.push_to_hub(f"llava-hf/{repo_id}-hf") - processor.push_to_hub(f"llava-hf/{repo_id}-hf") + checkpoint_name = model_id.split("/")[-1] + print(f"Pushing to repo llava-hf/{checkpoint_name}-hf") + model.push_to_hub(f"llava-hf/{checkpoint_name}-hf") + processor.push_to_hub(f"llava-hf/{checkpoint_name}-hf") if __name__ == "__main__": @@ -328,11 +380,14 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): "liuhaotian/llava-v1.6-vicuna-7b", "liuhaotian/llava-v1.6-vicuna-13b", "liuhaotian/llava-v1.6-34b", + "lmms-lab/llama3-llava-next-8b", + "lmms-lab/llava-next-72b", + "lmms-lab/llava-next-110b", ], required=False, ) parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory." ) parser.add_argument( "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 6295fb9562458b..f744b9fcf9c1cd 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -513,7 +513,7 @@ def get_image_patches( List[np.array]: A list of NumPy arrays containing the processed image patches. """ if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints must be a list of possible resolutions.") + raise TypeError("grid_pinpoints must be a list of possible resolutions.") possible_resolutions = grid_pinpoints diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 23e3c25025fcb6..ad76561df54fd7 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -60,12 +60,12 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): tuple: The shape of the image patch grid in the format (width, height). """ if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints should be a list of tuples or lists") + raise TypeError("grid_pinpoints should be a list of tuples or lists") # ! 
VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate if not isinstance(image_size, (list, tuple)): if not isinstance(image_size, (torch.Tensor, np.ndarray)): - raise ValueError( + raise TypeError( f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor" ) image_size = image_size.tolist() @@ -91,12 +91,12 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): int: the number of patches """ if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints should be a list of tuples or lists") + raise TypeError("grid_pinpoints should be a list of tuples or lists") # ! VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate if not isinstance(image_size, (list, tuple)): if not isinstance(image_size, (torch.Tensor, np.ndarray)): - raise ValueError(f"image_size invalid type {type(image_size)} with value {image_size}") + raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}") image_size = image_size.tolist() best_resolution = select_best_resolution(image_size, grid_pinpoints) @@ -518,8 +518,8 @@ def _merge_input_ids_with_image_features( _left_padding = torch.any(attention_mask[:, 0] == 0) _right_padding = torch.any(attention_mask[:, -1] == 0) - left_padding = True - if batch_size > 1: + left_padding = True if not self.training else False + if batch_size > 1 and not self.training: if _left_padding and not _right_padding: left_padding = True elif not _left_padding and _right_padding: diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 30b6abdf8e9f44..e3264dfd91e1a1 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -66,12 +66,12 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): tuple: The shape of the image patch grid in the format (width, height). """ if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints should be a list of tuples or lists") + raise TypeError("grid_pinpoints should be a list of tuples or lists") # ! VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate if not isinstance(image_size, (list, tuple)): if not isinstance(image_size, (torch.Tensor, np.ndarray)): - raise ValueError( + raise TypeError( f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor" ) image_size = image_size.tolist() @@ -97,12 +97,12 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): int: the number of patches """ if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints should be a list of tuples or lists") + raise TypeError("grid_pinpoints should be a list of tuples or lists") # ! 
VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate if not isinstance(image_size, (list, tuple)): if not isinstance(image_size, (torch.Tensor, np.ndarray)): - raise ValueError(f"image_size invalid type {type(image_size)} with value {image_size}") + raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}") image_size = image_size.tolist() best_resolution = select_best_resolution(image_size, grid_pinpoints) @@ -562,8 +562,8 @@ def _merge_input_ids_with_image_features( _left_padding = torch.any(attention_mask[:, 0] == 0) _right_padding = torch.any(attention_mask[:, -1] == 0) - left_padding = True - if batch_size > 1: + left_padding = True if not self.training else False + if batch_size > 1 and not self.training: if _left_padding and not _right_padding: left_padding = True elif not _left_padding and _right_padding: diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index 81426b3a0af3ac..6b5e86ab414958 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -159,63 +159,3 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - @property - def default_chat_template(self): - """ - This default vicuna template formats inputs in the form of a chat history. For each message in the chat history: - * the template will output the role of the speaker followed by the content of the message. - * content is a list of strings and images. - * If the content element is an image, the template will output a sequence of or
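# A minimal, self-contained sketch of the new-token initialization used in the conversion script above.
# The script samples the freshly allocated embedding rows from a distribution `dist` built earlier (not
# shown in this hunk); here that distribution is assumed to be a multivariate normal fitted to the
# pre-expansion embeddings. `init_new_token_embeddings` is an illustrative helper, not taken from the diff.
import torch


def init_new_token_embeddings(model, vocab_size: int, num_new_tokens: int, pad_shape: int = 64):
    # Grow the embedding matrix (and lm_head) to fit the new tokens, padded to a multiple of `pad_shape`.
    model.resize_token_embeddings(vocab_size + num_new_tokens, pad_to_multiple_of=pad_shape)

    embed = model.get_input_embeddings().weight.data
    lm_head = model.get_output_embeddings().weight.data

    # Fit a multivariate normal to the existing rows (computed in float32 for numerical stability).
    pre_expansion = embed[:vocab_size].float()
    mu = pre_expansion.mean(dim=0)
    sigma = (pre_expansion - mu).T @ (pre_expansion - mu) / vocab_size
    dist = torch.distributions.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma)

    # Sample one row per newly allocated slot so the added tokens start close to the existing distribution.
    num_rows = embed.shape[0] - vocab_size
    embed[vocab_size:] = torch.stack([dist.sample() for _ in range(num_rows)], dim=0).to(embed.dtype)
    lm_head[vocab_size:] = torch.stack([dist.sample() for _ in range(num_rows)], dim=0).to(lm_head.dtype)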
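# A short sketch of the padding-side check that `_merge_input_ids_with_image_features` relies on in the
# modeling changes above: the padding side is inferred purely from the attention mask, and mixed padding
# within one batch is rejected. The helper name and error message are illustrative, not taken from the diff.
import torch


def detect_padding_side(attention_mask: torch.Tensor) -> str:
    # A zero in the first column means at least one sequence is left-padded;
    # a zero in the last column means at least one sequence is right-padded.
    left = bool(torch.any(attention_mask[:, 0] == 0))
    right = bool(torch.any(attention_mask[:, -1] == 0))
    if left and right:
        raise ValueError("Both padding sides appear in the same batch; use a single padding side.")
    return "right" if right else "left"


# Example: a batch of two sequences where the shorter one is padded on the left.
mask = torch.tensor([[1, 1, 1, 1], [0, 0, 1, 1]])
assert detect_padding_side(mask) == "left"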