From 12b6880c81db7742a29ea425dcb9e63b7dbdc449 Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Mon, 22 Jul 2024 22:16:17 +0530 Subject: [PATCH 01/73] fix: Fixed raising `TypeError` instead of `ValueError` for invalid type (#32111) * Raised TypeError instead of ValueError for invalid types. * Updated formatting using ruff. * Retrieved few changes. * Retrieved few changes. * Updated tests accordingly. --- .../distillation/grouped_batch_sampler.py | 2 +- .../research_projects/tapex/wikisql_utils.py | 4 +-- src/transformers/agents/agent_types.py | 2 +- src/transformers/configuration_utils.py | 2 +- src/transformers/data/processors/xnli.py | 12 ++++---- .../generation/beam_constraints.py | 10 +++---- src/transformers/generation/utils.py | 4 +-- src/transformers/image_processing_base.py | 2 +- src/transformers/image_transforms.py | 6 ++-- src/transformers/image_utils.py | 2 +- src/transformers/integrations/awq.py | 2 +- src/transformers/integrations/peft.py | 4 +-- src/transformers/modeling_tf_utils.py | 2 +- .../models/align/modeling_align.py | 4 +-- .../models/altclip/modeling_altclip.py | 4 +-- .../models/bark/processing_bark.py | 2 +- src/transformers/models/blip/modeling_blip.py | 4 +-- .../models/blip/modeling_tf_blip.py | 4 +-- .../models/chameleon/processing_chameleon.py | 2 +- .../chinese_clip/modeling_chinese_clip.py | 4 +-- src/transformers/models/clap/modeling_clap.py | 4 +-- src/transformers/models/clip/modeling_clip.py | 4 +-- .../models/clip/modeling_tf_clip.py | 4 +-- .../models/clipseg/modeling_clipseg.py | 4 +-- src/transformers/models/clvp/modeling_clvp.py | 6 ++-- .../deberta_v2/tokenization_deberta_v2.py | 2 +- .../depth_anything/modeling_depth_anything.py | 2 +- src/transformers/models/dpt/modeling_dpt.py | 2 +- .../models/esm/openfold_utils/chunk_utils.py | 4 +-- .../esm/openfold_utils/residue_constants.py | 2 +- .../models/esm/openfold_utils/tensor_utils.py | 2 +- .../models/flava/modeling_flava.py | 6 ++-- .../models/groupvit/modeling_groupvit.py | 4 +-- .../models/groupvit/modeling_tf_groupvit.py | 4 +-- .../llava_next/image_processing_llava_next.py | 2 +- .../models/llava_next/modeling_llava_next.py | 8 ++--- .../modeling_llava_next_video.py | 8 ++--- .../models/luke/tokenization_luke.py | 2 +- .../models/mluke/tokenization_mluke.py | 2 +- .../models/owlv2/modeling_owlv2.py | 4 +-- .../models/owlvit/modeling_owlvit.py | 4 +-- src/transformers/models/rag/retrieval_rag.py | 2 +- .../models/siglip/modeling_siglip.py | 4 +-- .../models/udop/configuration_udop.py | 2 +- .../processing_wav2vec2_with_lm.py | 2 +- .../models/x_clip/modeling_x_clip.py | 4 +-- .../models/zoedepth/modeling_zoedepth.py | 2 +- .../pipelines/audio_classification.py | 2 +- .../pipelines/automatic_speech_recognition.py | 2 +- .../zero_shot_audio_classification.py | 2 +- src/transformers/processing_utils.py | 2 +- src/transformers/tokenization_utils.py | 2 +- src/transformers/utils/quantization_config.py | 30 +++++++++---------- tests/agents/test_tools_common.py | 2 +- tests/models/luke/test_tokenization_luke.py | 2 +- tests/models/mluke/test_tokenization_mluke.py | 2 +- .../test_pipelines_feature_extraction.py | 2 +- tests/test_pipeline_mixin.py | 2 +- 58 files changed, 111 insertions(+), 113 deletions(-) diff --git a/examples/research_projects/distillation/grouped_batch_sampler.py b/examples/research_projects/distillation/grouped_batch_sampler.py index fd126b13b58ee7..e25def738a8483 100644 --- a/examples/research_projects/distillation/grouped_batch_sampler.py +++ 
b/examples/research_projects/distillation/grouped_batch_sampler.py @@ -59,7 +59,7 @@ class GroupedBatchSampler(BatchSampler): def __init__(self, sampler, group_ids, batch_size): if not isinstance(sampler, Sampler): - raise ValueError( + raise TypeError( "sampler should be an instance of torch.utils.data.Sampler, but got sampler={}".format(sampler) ) self.sampler = sampler diff --git a/examples/research_projects/tapex/wikisql_utils.py b/examples/research_projects/tapex/wikisql_utils.py index 3351bddf019448..13d10e091a10c1 100644 --- a/examples/research_projects/tapex/wikisql_utils.py +++ b/examples/research_projects/tapex/wikisql_utils.py @@ -48,7 +48,7 @@ def convert_to_float(value): if isinstance(value, int): return float(value) if not isinstance(value, str): - raise ValueError("Argument value is not a string. Can't parse it as float") + raise TypeError("Argument value is not a string. Can't parse it as float") sanitized = value try: @@ -158,7 +158,7 @@ def _respect_conditions(table, row, conditions): cmp_value = _normalize_for_match(cmp_value) if not isinstance(table_value, type(cmp_value)): - raise ValueError("Type difference {} != {}".format(type(table_value), type(cmp_value))) + raise TypeError("Type difference {} != {}".format(type(table_value), type(cmp_value))) if not _compare(cond.operator, table_value, cmp_value): return False diff --git a/src/transformers/agents/agent_types.py b/src/transformers/agents/agent_types.py index 0b4999b7f76d3c..114b6de01c3333 100644 --- a/src/transformers/agents/agent_types.py +++ b/src/transformers/agents/agent_types.py @@ -107,7 +107,7 @@ def __init__(self, value): elif isinstance(value, np.ndarray): self._tensor = torch.tensor(value) else: - raise ValueError(f"Unsupported type for {self.__class__.__name__}: {type(value)}") + raise TypeError(f"Unsupported type for {self.__class__.__name__}: {type(value)}") def _ipython_display_(self, include=None, exclude=None): """ diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index c6de824339bbc0..2f84bc29aee25d 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -1004,7 +1004,7 @@ def update_from_string(self, update_str: str): elif isinstance(old_v, float): v = float(v) elif not isinstance(old_v, str): - raise ValueError( + raise TypeError( f"You can only update int, float, bool or string values in the config, got {v} for key {k}" ) diff --git a/src/transformers/data/processors/xnli.py b/src/transformers/data/processors/xnli.py index 459c5bc3a6a38e..4d8ec17a8345db 100644 --- a/src/transformers/data/processors/xnli.py +++ b/src/transformers/data/processors/xnli.py @@ -47,11 +47,11 @@ def get_train_examples(self, data_dir): text_b = line[1] label = "contradiction" if line[2] == "contradictory" else line[2] if not isinstance(text_a, str): - raise ValueError(f"Training input {text_a} is not a string") + raise TypeError(f"Training input {text_a} is not a string") if not isinstance(text_b, str): - raise ValueError(f"Training input {text_b} is not a string") + raise TypeError(f"Training input {text_b} is not a string") if not isinstance(label, str): - raise ValueError(f"Training label {label} is not a string") + raise TypeError(f"Training label {label} is not a string") examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples @@ -70,11 +70,11 @@ def get_test_examples(self, data_dir): text_b = line[7] label = line[1] if not isinstance(text_a, str): - raise ValueError(f"Training 
input {text_a} is not a string") + raise TypeError(f"Training input {text_a} is not a string") if not isinstance(text_b, str): - raise ValueError(f"Training input {text_b} is not a string") + raise TypeError(f"Training input {text_b} is not a string") if not isinstance(label, str): - raise ValueError(f"Training label {label} is not a string") + raise TypeError(f"Training label {label} is not a string") examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples diff --git a/src/transformers/generation/beam_constraints.py b/src/transformers/generation/beam_constraints.py index b53c4512427a87..e6462f322c49f7 100644 --- a/src/transformers/generation/beam_constraints.py +++ b/src/transformers/generation/beam_constraints.py @@ -156,7 +156,7 @@ def advance(self): def does_advance(self, token_id: int): if not isinstance(token_id, int): - raise ValueError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}") + raise TypeError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}") if self.completed: return False @@ -165,7 +165,7 @@ def does_advance(self, token_id: int): def update(self, token_id: int): if not isinstance(token_id, int): - raise ValueError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}") + raise TypeError(f"`token_id` has to be an `int`, but is {token_id} of type {type(token_id)}") stepped = False completed = False @@ -300,7 +300,7 @@ def advance(self): def does_advance(self, token_id: int): if not isinstance(token_id, int): - raise ValueError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}") + raise TypeError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}") next_tokens = self.trie.next_tokens(self.current_seq) @@ -308,7 +308,7 @@ def does_advance(self, token_id: int): def update(self, token_id: int): if not isinstance(token_id, int): - raise ValueError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}") + raise TypeError(f"`token_id` is supposed to be type `int`, but is {token_id} of type {type(token_id)}") stepped = False completed = False @@ -432,7 +432,7 @@ def reset(self, token_ids: Optional[List[int]]): def add(self, token_id: int): if not isinstance(token_id, int): - raise ValueError(f"`token_id` should be an `int`, but is `{token_id}`.") + raise TypeError(f"`token_id` should be an `int`, but is `{token_id}`.") complete, stepped = False, False diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 51019da9a6b378..9d3a92d268819a 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4281,7 +4281,7 @@ def _split(data, full_batch_size: int, split_size: int = None): for i in range(0, full_batch_size, split_size) ] else: - raise ValueError(f"Unexpected attribute type: {type(data)}") + raise TypeError(f"Unexpected attribute type: {type(data)}") def _split_model_inputs( @@ -4388,7 +4388,7 @@ def _concat(data): # If the elements are integers or floats, return a tensor return torch.tensor(data) else: - raise ValueError(f"Unexpected attribute type: {type(data[0])}") + raise TypeError(f"Unexpected attribute type: {type(data[0])}") # Use a dictionary comprehension to gather attributes from all objects and concatenate them concatenated_data = { diff --git a/src/transformers/image_processing_base.py b/src/transformers/image_processing_base.py index 6c80aee0164722..9b314f83c11fb1 100644 
--- a/src/transformers/image_processing_base.py +++ b/src/transformers/image_processing_base.py @@ -544,7 +544,7 @@ def fetch_images(self, image_url_or_urls: Union[str, List[str]]): response.raise_for_status() return Image.open(BytesIO(response.content)) else: - raise ValueError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}") + raise TypeError(f"only a single or a list of entries is supported but got type={type(image_url_or_urls)}") ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub) diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index 4e4812879eed1c..580570f6066278 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -75,7 +75,7 @@ def to_channel_dimension_format( `np.ndarray`: The image with the channel dimension set to `channel_dim`. """ if not isinstance(image, np.ndarray): - raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}") if input_channel_dim is None: input_channel_dim = infer_channel_dimension_format(image) @@ -121,7 +121,7 @@ def rescale( `np.ndarray`: The rescaled image. """ if not isinstance(image, np.ndarray): - raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}") rescaled_image = image * scale if data_format is not None: @@ -453,7 +453,7 @@ def center_crop( return_numpy = True if return_numpy is None else return_numpy if not isinstance(image, np.ndarray): - raise ValueError(f"Input image must be of type np.ndarray, got {type(image)}") + raise TypeError(f"Input image must be of type np.ndarray, got {type(image)}") if not isinstance(size, Iterable) or len(size) != 2: raise ValueError("size must have 2 elements representing the height and width of the output image") diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 13edcb3a2ad43b..4449b602491ad9 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -377,7 +377,7 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = elif isinstance(image, PIL.Image.Image): image = image else: - raise ValueError( + raise TypeError( "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image." ) image = PIL.ImageOps.exif_transpose(image) diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py index 30427aa405dd56..550c23fde3d4ad 100644 --- a/src/transformers/integrations/awq.py +++ b/src/transformers/integrations/awq.py @@ -199,7 +199,7 @@ def get_modules_to_fuse(model, quantization_config): The quantization configuration to use. 
""" if not isinstance(model, PreTrainedModel): - raise ValueError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}") + raise TypeError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}") # Always default to `quantization_config.modules_to_fuse` if quantization_config.modules_to_fuse is not None: diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index a543315410c785..923aa59e4184dc 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -262,9 +262,7 @@ def add_adapter(self, adapter_config, adapter_name: Optional[str] = None) -> Non raise ValueError(f"Adapter with name {adapter_name} already exists. Please use a different name.") if not isinstance(adapter_config, PeftConfig): - raise ValueError( - f"adapter_config should be an instance of PeftConfig. Got {type(adapter_config)} instead." - ) + raise TypeError(f"adapter_config should be an instance of PeftConfig. Got {type(adapter_config)} instead.") # Retrieve the name or path of the model, one could also use self.config._name_or_path # but to be consistent with what we do in PEFT: https://github.com/huggingface/peft/blob/6e783780ca9df3a623992cc4d1d665001232eae0/src/peft/mapping.py#L100 diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 0ad5dd0396194a..3d7658ba372130 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -1209,7 +1209,7 @@ def build(self, input_shape=None): def __init__(self, config, *inputs, **kwargs): super().__init__(*inputs, **kwargs) if not isinstance(config, PretrainedConfig): - raise ValueError( + raise TypeError( f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class " "`PretrainedConfig`. To create a model from a pretrained model use " f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`" diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index d6e6023a26f768..1b744d0f208d46 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -1418,13 +1418,13 @@ def __init__(self, config: AlignConfig): super().__init__(config) if not isinstance(config.text_config, AlignTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type AlignTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, AlignVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type AlignVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 10c9e10491bbda..f9856ef701f9e0 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -1466,12 +1466,12 @@ def __init__(self, config: AltCLIPConfig): super().__init__(config) if not isinstance(config.vision_config, AltCLIPVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type AltCLIPVisionConfig but is of type" f" {type(config.vision_config)}." 
) if not isinstance(config.text_config, AltCLIPTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type AltCLIPTextConfig but is of type" f" {type(config.text_config)}." ) diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index a9bf55b51f6015..53715f3260422c 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -211,7 +211,7 @@ def _validate_voice_preset_dict(self, voice_preset: Optional[dict] = None): raise ValueError(f"Voice preset unrecognized, missing {key} as a key.") if not isinstance(voice_preset[key], np.ndarray): - raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.") + raise TypeError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.") if len(voice_preset[key].shape) != self.preset_shape[key]: raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.") diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index a8f20ace6bd862..46e3a6005b0af6 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -755,13 +755,13 @@ def __init__(self, config: BlipConfig): super().__init__(config) if not isinstance(config.text_config, BlipTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type BlipTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, BlipVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type BlipVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/blip/modeling_tf_blip.py b/src/transformers/models/blip/modeling_tf_blip.py index 1557677eb3fbf2..6c9942b73acefb 100644 --- a/src/transformers/models/blip/modeling_tf_blip.py +++ b/src/transformers/models/blip/modeling_tf_blip.py @@ -794,13 +794,13 @@ def __init__(self, config: BlipConfig, *args, **kwargs): super().__init__(*args, **kwargs) if not isinstance(config.text_config, BlipTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type BlipTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, BlipVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type BlipVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 559cac62e3d5a7..1480808336d14e 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -113,7 +113,7 @@ def __call__( if isinstance(text, str): text = [text] elif not isinstance(text, list) and not isinstance(text[0], str): - raise ValueError("Invalid input text. Please provide a string, or a list of strings") + raise TypeError("Invalid input text. 
Please provide a string, or a list of strings") # Replace the image token with the expanded image token sequence prompt_strings = [] diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 801969c465bfb0..6fbd9459f5ad71 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -1341,13 +1341,13 @@ def __init__(self, config: ChineseCLIPConfig): super().__init__(config) if not isinstance(config.text_config, ChineseCLIPTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type ChineseCLIPTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, ChineseCLIPVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type ChineseCLIPVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 3e83daa942c022..939032f2c894cc 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1928,13 +1928,13 @@ def __init__(self, config: ClapConfig): super().__init__(config) if not isinstance(config.text_config, ClapTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type ClapTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.audio_config, ClapAudioConfig): - raise ValueError( + raise TypeError( "config.audio_config is expected to be of type ClapAudioConfig but is of type" f" {type(config.audio_config)}." ) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index b96acfc0936c1d..ee85fe3125873b 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -1119,13 +1119,13 @@ def __init__(self, config: CLIPConfig): super().__init__(config) if not isinstance(config.text_config, CLIPTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type CLIPTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, CLIPVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type CLIPVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py index b728da52c222b4..ca5f4aede21854 100644 --- a/src/transformers/models/clip/modeling_tf_clip.py +++ b/src/transformers/models/clip/modeling_tf_clip.py @@ -825,13 +825,13 @@ def __init__(self, config: CLIPConfig, **kwargs): super().__init__(**kwargs) if not isinstance(config.text_config, CLIPTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type CLIPTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, CLIPVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type CLIPVisionConfig but is of type" f" {type(config.vision_config)}." 
) diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index af7b94a10f4d7c..97fcf3d1f2b3e6 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -924,13 +924,13 @@ def __init__(self, config: CLIPSegConfig): super().__init__(config) if not isinstance(config.text_config, CLIPSegTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type CLIPSegTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, CLIPSegVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type CLIPSegVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index a673d64614d786..4124e380a3d73d 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -1513,19 +1513,19 @@ def __init__(self, config: ClvpConfig): super().__init__(config) if not isinstance(config.text_config, ClvpEncoderConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type `ClvpEncoderConfig` but is of type" f" {type(config.text_config)}." ) if not isinstance(config.speech_config, ClvpEncoderConfig): - raise ValueError( + raise TypeError( "config.speech_config is expected to be of type `ClvpEncoderConfig` but is of type" f" {type(config.speech_config)}." ) if not isinstance(config.decoder_config, ClvpDecoderConfig): - raise ValueError( + raise TypeError( "config.decoder_config is expected to be of type `ClvpDecoderConfig` but is of type" f" {type(config.decoder_config)}." ) diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index 2876ac7660493c..6ff689f80a5c1b 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -518,4 +518,4 @@ def convert_to_unicode(text): elif isinstance(text, bytes): return text.decode("utf-8", "ignore") else: - raise ValueError(f"Unsupported string type: {type(text)}") + raise TypeError(f"Unsupported string type: {type(text)}") diff --git a/src/transformers/models/depth_anything/modeling_depth_anything.py b/src/transformers/models/depth_anything/modeling_depth_anything.py index 0b1ef77c6a732a..e37f0a3eaf7cbf 100644 --- a/src/transformers/models/depth_anything/modeling_depth_anything.py +++ b/src/transformers/models/depth_anything/modeling_depth_anything.py @@ -298,7 +298,7 @@ def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_wi List of hidden states from the backbone. 
""" if not isinstance(hidden_states, (tuple, list)): - raise ValueError("hidden_states should be a tuple or list of tensors") + raise TypeError("hidden_states should be a tuple or list of tensors") if len(hidden_states) != len(self.config.neck_hidden_sizes): raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index b2b88855669a76..b3e4b86a2a49dc 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -1002,7 +1002,7 @@ def forward(self, hidden_states: List[torch.Tensor], patch_height=None, patch_wi List of hidden states from the backbone. """ if not isinstance(hidden_states, (tuple, list)): - raise ValueError("hidden_states should be a tuple or list of tensors") + raise TypeError("hidden_states should be a tuple or list of tensors") if len(hidden_states) != len(self.config.neck_hidden_sizes): raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") diff --git a/src/transformers/models/esm/openfold_utils/chunk_utils.py b/src/transformers/models/esm/openfold_utils/chunk_utils.py index 301721d135ee4d..16131b8590954b 100644 --- a/src/transformers/models/esm/openfold_utils/chunk_utils.py +++ b/src/transformers/models/esm/openfold_utils/chunk_utils.py @@ -32,7 +32,7 @@ def _fetch_dims(tree: Union[dict, list, tuple, torch.Tensor]) -> List[Tuple[int, elif isinstance(tree, torch.Tensor): shapes.append(tree.shape) else: - raise ValueError("Not supported") + raise TypeError("Not supported") return shapes @@ -302,7 +302,7 @@ def assign(d1: dict, d2: dict) -> None: else: out[i : i + chunk_size] = output_chunk else: - raise ValueError("Not supported") + raise TypeError("Not supported") i += chunk_size diff --git a/src/transformers/models/esm/openfold_utils/residue_constants.py b/src/transformers/models/esm/openfold_utils/residue_constants.py index 8f0ad3b50c6505..200e0d421b8386 100644 --- a/src/transformers/models/esm/openfold_utils/residue_constants.py +++ b/src/transformers/models/esm/openfold_utils/residue_constants.py @@ -394,7 +394,7 @@ def map_structure_with_atom_order(in_list: list, first_call: bool = True) -> lis elif isinstance(in_list[i], str): in_list[i] = atom_order[in_list[i]] else: - raise ValueError("Unexpected type when mapping nested lists!") + raise TypeError("Unexpected type when mapping nested lists!") return in_list diff --git a/src/transformers/models/esm/openfold_utils/tensor_utils.py b/src/transformers/models/esm/openfold_utils/tensor_utils.py index 20ee34b236f177..efe72e4905b81f 100644 --- a/src/transformers/models/esm/openfold_utils/tensor_utils.py +++ b/src/transformers/models/esm/openfold_utils/tensor_utils.py @@ -134,7 +134,7 @@ def tree_map(fn, tree, leaf_type): return fn(tree) else: print(type(tree)) - raise ValueError("Not supported") + raise TypeError("Not supported") tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor) diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index dbc4e51703847a..314925789ce1f4 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -1181,19 +1181,19 @@ def __init__(self, config: FlavaConfig): super().__init__(config) if not isinstance(config.text_config, FlavaTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type FlavaTextConfig 
but is of type" f" {type(config.text_config)}." ) if not isinstance(config.image_config, FlavaImageConfig): - raise ValueError( + raise TypeError( "config.image_config is expected to be of type FlavaImageConfig but is of type" f" {type(config.image_config)}." ) if not isinstance(config.multimodal_config, FlavaMultimodalConfig): - raise ValueError( + raise TypeError( "config.multimodal_config is expected to be of type FlavaMultimodalConfig but " + f"is of type {type(config.multimodal_config)}." ) diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index 32e1d777cb7dad..2a0d4f3c0e4e2b 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -1302,13 +1302,13 @@ def __init__(self, config: GroupViTConfig): super().__init__(config) if not isinstance(config.text_config, GroupViTTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type GroupViTTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, GroupViTVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type GroupViTVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/groupvit/modeling_tf_groupvit.py b/src/transformers/models/groupvit/modeling_tf_groupvit.py index f06c5f57f83fb3..b5838a5264f69d 100644 --- a/src/transformers/models/groupvit/modeling_tf_groupvit.py +++ b/src/transformers/models/groupvit/modeling_tf_groupvit.py @@ -1443,13 +1443,13 @@ def __init__(self, config: GroupViTConfig, **kwargs): super().__init__(**kwargs) if not isinstance(config.text_config, GroupViTTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type GroupViTTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, GroupViTVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type GroupViTVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py index 6295fb9562458b..f744b9fcf9c1cd 100644 --- a/src/transformers/models/llava_next/image_processing_llava_next.py +++ b/src/transformers/models/llava_next/image_processing_llava_next.py @@ -513,7 +513,7 @@ def get_image_patches( List[np.array]: A list of NumPy arrays containing the processed image patches. """ if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints must be a list of possible resolutions.") + raise TypeError("grid_pinpoints must be a list of possible resolutions.") possible_resolutions = grid_pinpoints diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 23e3c25025fcb6..5b897b817330b7 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -60,12 +60,12 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): tuple: The shape of the image patch grid in the format (width, height). """ if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints should be a list of tuples or lists") + raise TypeError("grid_pinpoints should be a list of tuples or lists") # ! 
VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate if not isinstance(image_size, (list, tuple)): if not isinstance(image_size, (torch.Tensor, np.ndarray)): - raise ValueError( + raise TypeError( f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor" ) image_size = image_size.tolist() @@ -91,12 +91,12 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): int: the number of patches """ if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints should be a list of tuples or lists") + raise TypeError("grid_pinpoints should be a list of tuples or lists") # ! VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate if not isinstance(image_size, (list, tuple)): if not isinstance(image_size, (torch.Tensor, np.ndarray)): - raise ValueError(f"image_size invalid type {type(image_size)} with value {image_size}") + raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}") image_size = image_size.tolist() best_resolution = select_best_resolution(image_size, grid_pinpoints) diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 30b6abdf8e9f44..f2ccb99e618753 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -66,12 +66,12 @@ def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size): tuple: The shape of the image patch grid in the format (width, height). """ if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints should be a list of tuples or lists") + raise TypeError("grid_pinpoints should be a list of tuples or lists") # ! VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate if not isinstance(image_size, (list, tuple)): if not isinstance(image_size, (torch.Tensor, np.ndarray)): - raise ValueError( + raise TypeError( f"image_size invalid type: {type(image_size)} not valid, should be either list, tuple, np.ndarray or tensor" ) image_size = image_size.tolist() @@ -97,12 +97,12 @@ def image_size_to_num_patches(image_size, grid_pinpoints, patch_size: int): int: the number of patches """ if not isinstance(grid_pinpoints, list): - raise ValueError("grid_pinpoints should be a list of tuples or lists") + raise TypeError("grid_pinpoints should be a list of tuples or lists") # ! 
VERY IMPORTANT if image_size is tensor, must convert to into tuple, otherwise it will cause wrong calculate if not isinstance(image_size, (list, tuple)): if not isinstance(image_size, (torch.Tensor, np.ndarray)): - raise ValueError(f"image_size invalid type {type(image_size)} with value {image_size}") + raise TypeError(f"image_size invalid type {type(image_size)} with value {image_size}") image_size = image_size.tolist() best_resolution = select_best_resolution(image_size, grid_pinpoints) diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py index d37258f2a40012..1a570992ffb406 100644 --- a/src/transformers/models/luke/tokenization_luke.py +++ b/src/transformers/models/luke/tokenization_luke.py @@ -889,7 +889,7 @@ def _batch_encode_plus( def _check_entity_input_format(self, entities: Optional[EntityInput], entity_spans: Optional[EntitySpanInput]): if not isinstance(entity_spans, list): - raise ValueError("entity_spans should be given as a list") + raise TypeError("entity_spans should be given as a list") elif len(entity_spans) > 0 and not isinstance(entity_spans[0], tuple): raise ValueError( "entity_spans should be given as a list of tuples containing the start and end character indices" diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py index 004f6526f5f421..3ac8191402af90 100644 --- a/src/transformers/models/mluke/tokenization_mluke.py +++ b/src/transformers/models/mluke/tokenization_mluke.py @@ -721,7 +721,7 @@ def _batch_encode_plus( # Copied from transformers.models.luke.tokenization_luke.LukeTokenizer._check_entity_input_format def _check_entity_input_format(self, entities: Optional[EntityInput], entity_spans: Optional[EntitySpanInput]): if not isinstance(entity_spans, list): - raise ValueError("entity_spans should be given as a list") + raise TypeError("entity_spans should be given as a list") elif len(entity_spans) > 0 and not isinstance(entity_spans[0], tuple): raise ValueError( "entity_spans should be given as a list of tuples containing the start and end character indices" diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py index 0c4b60a4f5ec79..bc6735ff86b562 100644 --- a/src/transformers/models/owlv2/modeling_owlv2.py +++ b/src/transformers/models/owlv2/modeling_owlv2.py @@ -1015,13 +1015,13 @@ def __init__(self, config: Owlv2Config): super().__init__(config) if not isinstance(config.text_config, Owlv2TextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type Owlv2TextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, Owlv2VisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type Owlv2VisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index 89d92c2209a143..94b815985878a0 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -998,13 +998,13 @@ def __init__(self, config: OwlViTConfig): super().__init__(config) if not isinstance(config.text_config, OwlViTTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type OwlViTTextConfig but is of type" f" {type(config.text_config)}." 
) if not isinstance(config.vision_config, OwlViTVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type OwlViTVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/rag/retrieval_rag.py b/src/transformers/models/rag/retrieval_rag.py index a448132300d338..b9ae49b5e9c1aa 100644 --- a/src/transformers/models/rag/retrieval_rag.py +++ b/src/transformers/models/rag/retrieval_rag.py @@ -204,7 +204,7 @@ def __init__(self, vector_size, dataset, index_initialized=False): def _check_dataset_format(self, with_index: bool): if not isinstance(self.dataset, Dataset): - raise ValueError(f"Dataset should be a datasets.Dataset object, but got {type(self.dataset)}") + raise TypeError(f"Dataset should be a datasets.Dataset object, but got {type(self.dataset)}") if len({"title", "text", "embeddings"} - set(self.dataset.column_names)) > 0: raise ValueError( "Dataset should be a dataset with the following columns: " diff --git a/src/transformers/models/siglip/modeling_siglip.py b/src/transformers/models/siglip/modeling_siglip.py index 7c15dea3876b9f..797a8fa0c0ef66 100644 --- a/src/transformers/models/siglip/modeling_siglip.py +++ b/src/transformers/models/siglip/modeling_siglip.py @@ -1202,13 +1202,13 @@ def __init__(self, config: SiglipConfig): super().__init__(config) if not isinstance(config.text_config, SiglipTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type SiglipTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, SiglipVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type SiglipVisionConfig but is of type" f" {type(config.vision_config)}." 
) diff --git a/src/transformers/models/udop/configuration_udop.py b/src/transformers/models/udop/configuration_udop.py index bc1765e289c6a1..5ae8bcebfd79a2 100644 --- a/src/transformers/models/udop/configuration_udop.py +++ b/src/transformers/models/udop/configuration_udop.py @@ -135,7 +135,7 @@ def __init__( self.patch_size = patch_size self.num_channels = num_channels if not isinstance(relative_bias_args, list): - raise ValueError("`relative_bias_args` should be a list of dictionaries.") + raise TypeError("`relative_bias_args` should be a list of dictionaries.") self.relative_bias_args = relative_bias_args act_info = self.feed_forward_proj.split("-") diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py index 410fe710194e9f..0081008009e3a0 100644 --- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py +++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py @@ -92,7 +92,7 @@ def __init__( super().__init__(feature_extractor, tokenizer) if not isinstance(decoder, BeamSearchDecoderCTC): - raise ValueError(f"`decoder` has to be of type {BeamSearchDecoderCTC.__class__}, but is {type(decoder)}") + raise TypeError(f"`decoder` has to be of type {BeamSearchDecoderCTC.__class__}, but is {type(decoder)}") if feature_extractor.__class__.__name__ not in ["Wav2Vec2FeatureExtractor", "SeamlessM4TFeatureExtractor"]: raise ValueError( diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index b00b42281b916a..791e501d173721 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -1242,13 +1242,13 @@ def __init__(self, config: XCLIPConfig): super().__init__(config) if not isinstance(config.text_config, XCLIPTextConfig): - raise ValueError( + raise TypeError( "config.text_config is expected to be of type XCLIPTextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, XCLIPVisionConfig): - raise ValueError( + raise TypeError( "config.vision_config is expected to be of type XCLIPVisionConfig but is of type" f" {type(config.vision_config)}." ) diff --git a/src/transformers/models/zoedepth/modeling_zoedepth.py b/src/transformers/models/zoedepth/modeling_zoedepth.py index f03f775d1e4faf..2a00487c1b4b90 100644 --- a/src/transformers/models/zoedepth/modeling_zoedepth.py +++ b/src/transformers/models/zoedepth/modeling_zoedepth.py @@ -334,7 +334,7 @@ def forward(self, hidden_states: List[torch.Tensor], patch_height, patch_width) List of hidden states from the backbone. 
""" if not isinstance(hidden_states, (tuple, list)): - raise ValueError("hidden_states should be a tuple or list of tensors") + raise TypeError("hidden_states should be a tuple or list of tensors") if len(hidden_states) != len(self.config.neck_hidden_sizes): raise ValueError("The number of hidden states should be equal to the number of neck hidden sizes.") diff --git a/src/transformers/pipelines/audio_classification.py b/src/transformers/pipelines/audio_classification.py index a0e8f626db644e..517fbd9a7f409f 100644 --- a/src/transformers/pipelines/audio_classification.py +++ b/src/transformers/pipelines/audio_classification.py @@ -190,7 +190,7 @@ def preprocess(self, inputs): ).numpy() if not isinstance(inputs, np.ndarray): - raise ValueError("We expect a numpy ndarray as input") + raise TypeError("We expect a numpy ndarray as input") if len(inputs.shape) != 1: raise ValueError("We expect a single channel audio input for AudioClassificationPipeline") diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 01faab6d74adac..f3de341d88954c 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -406,7 +406,7 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None): # of the original length in the stride so we can cut properly. stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio))) if not isinstance(inputs, np.ndarray): - raise ValueError(f"We expect a numpy ndarray as input, got `{type(inputs)}`") + raise TypeError(f"We expect a numpy ndarray as input, got `{type(inputs)}`") if len(inputs.shape) != 1: raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline") diff --git a/src/transformers/pipelines/zero_shot_audio_classification.py b/src/transformers/pipelines/zero_shot_audio_classification.py index d9109aebd9c529..59500d14e104e7 100644 --- a/src/transformers/pipelines/zero_shot_audio_classification.py +++ b/src/transformers/pipelines/zero_shot_audio_classification.py @@ -114,7 +114,7 @@ def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is audio = ffmpeg_read(audio, self.feature_extractor.sampling_rate) if not isinstance(audio, np.ndarray): - raise ValueError("We expect a numpy ndarray as input") + raise TypeError("We expect a numpy ndarray as input") if len(audio.shape) != 1: raise ValueError("We expect a single channel audio input for ZeroShotAudioClassificationPipeline") diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index 24c6af79663652..a699ce94673618 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -356,7 +356,7 @@ def __init__(self, *args, **kwargs): proper_class = getattr(transformers_module, class_name) if not isinstance(arg, proper_class): - raise ValueError( + raise TypeError( f"Received a {type(arg).__name__} for argument {attribute_name}, but a {class_name} was expected." 
) diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 4231526265ba59..1853d2de4560ea 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -474,7 +474,7 @@ def added_tokens_decoder(self, value: Dict[int, Union[AddedToken, str]]) -> Dict # Always raise an error if string because users should define the behavior for index, token in value.items(): if not isinstance(token, (str, AddedToken)) or not isinstance(index, int): - raise ValueError( + raise TypeError( f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__}, should be a dict of {int, Union[AddedToken, str]}" ) diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 20d142b83f46c2..506c4db447c7aa 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -405,7 +405,7 @@ def load_in_4bit(self): @load_in_4bit.setter def load_in_4bit(self, value: bool): if not isinstance(value, bool): - raise ValueError("load_in_4bit must be a boolean") + raise TypeError("load_in_4bit must be a boolean") if self.load_in_8bit and value: raise ValueError("load_in_4bit and load_in_8bit are both True, but only one can be used at the same time") @@ -418,7 +418,7 @@ def load_in_8bit(self): @load_in_8bit.setter def load_in_8bit(self, value: bool): if not isinstance(value, bool): - raise ValueError("load_in_8bit must be a boolean") + raise TypeError("load_in_8bit must be a boolean") if self.load_in_4bit and value: raise ValueError("load_in_4bit and load_in_8bit are both True, but only one can be used at the same time") @@ -429,30 +429,30 @@ def post_init(self): Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. 
""" if not isinstance(self.load_in_4bit, bool): - raise ValueError("load_in_4bit must be a boolean") + raise TypeError("load_in_4bit must be a boolean") if not isinstance(self.load_in_8bit, bool): - raise ValueError("load_in_8bit must be a boolean") + raise TypeError("load_in_8bit must be a boolean") if not isinstance(self.llm_int8_threshold, float): - raise ValueError("llm_int8_threshold must be a float") + raise TypeError("llm_int8_threshold must be a float") if self.llm_int8_skip_modules is not None and not isinstance(self.llm_int8_skip_modules, list): - raise ValueError("llm_int8_skip_modules must be a list of strings") + raise TypeError("llm_int8_skip_modules must be a list of strings") if not isinstance(self.llm_int8_enable_fp32_cpu_offload, bool): - raise ValueError("llm_int8_enable_fp32_cpu_offload must be a boolean") + raise TypeError("llm_int8_enable_fp32_cpu_offload must be a boolean") if not isinstance(self.llm_int8_has_fp16_weight, bool): - raise ValueError("llm_int8_has_fp16_weight must be a boolean") + raise TypeError("llm_int8_has_fp16_weight must be a boolean") if self.bnb_4bit_compute_dtype is not None and not isinstance(self.bnb_4bit_compute_dtype, torch.dtype): - raise ValueError("bnb_4bit_compute_dtype must be torch.dtype") + raise TypeError("bnb_4bit_compute_dtype must be torch.dtype") if not isinstance(self.bnb_4bit_quant_type, str): - raise ValueError("bnb_4bit_quant_type must be a string") + raise TypeError("bnb_4bit_quant_type must be a string") if not isinstance(self.bnb_4bit_use_double_quant, bool): - raise ValueError("bnb_4bit_use_double_quant must be a boolean") + raise TypeError("bnb_4bit_use_double_quant must be a boolean") if self.load_in_4bit and not version.parse(importlib.metadata.version("bitsandbytes")) >= version.parse( "0.39.0" @@ -957,13 +957,13 @@ def post_init(self): Safety checker that arguments are correct - also replaces some NoneType arguments with their default values. 
""" if not isinstance(self.in_group_size, int): - raise ValueError("in_group_size must be a float") + raise TypeError("in_group_size must be a float") if not isinstance(self.out_group_size, int): - raise ValueError("out_group_size must be a float") + raise TypeError("out_group_size must be a float") if not isinstance(self.num_codebooks, int): - raise ValueError("num_codebooks must be a float") + raise TypeError("num_codebooks must be a float") if not isinstance(self.nbits_per_codebook, int): - raise ValueError("nbits_per_codebook must be a float") + raise TypeError("nbits_per_codebook must be a float") if self.linear_weights_not_to_quantize is not None and not isinstance( self.linear_weights_not_to_quantize, list diff --git a/tests/agents/test_tools_common.py b/tests/agents/test_tools_common.py index bd560e9053679c..bb8881d92e915d 100644 --- a/tests/agents/test_tools_common.py +++ b/tests/agents/test_tools_common.py @@ -60,7 +60,7 @@ def output_type(output): elif isinstance(output, (torch.Tensor, AgentAudio)): return "audio" else: - raise ValueError(f"Invalid output: {output}") + raise TypeError(f"Invalid output: {output}") @is_agent_test diff --git a/tests/models/luke/test_tokenization_luke.py b/tests/models/luke/test_tokenization_luke.py index ae6db98eaf8b6d..a7b544d4608d71 100644 --- a/tests/models/luke/test_tokenization_luke.py +++ b/tests/models/luke/test_tokenization_luke.py @@ -188,7 +188,7 @@ def test_if_tokenize_single_text_raise_error_with_invalid_inputs(self): with self.assertRaises(ValueError): tokenizer(sentence, entities=tuple(entities), entity_spans=spans) - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): tokenizer(sentence, entities=entities, entity_spans=tuple(spans)) with self.assertRaises(ValueError): diff --git a/tests/models/mluke/test_tokenization_mluke.py b/tests/models/mluke/test_tokenization_mluke.py index edb62a791c0245..bc9210c9139f62 100644 --- a/tests/models/mluke/test_tokenization_mluke.py +++ b/tests/models/mluke/test_tokenization_mluke.py @@ -151,7 +151,7 @@ def test_if_tokenize_single_text_raise_error_with_invalid_inputs(self): with self.assertRaises(ValueError): tokenizer(sentence, entities=tuple(entities), entity_spans=spans) - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): tokenizer(sentence, entities=entities, entity_spans=tuple(spans)) with self.assertRaises(ValueError): diff --git a/tests/pipelines/test_pipelines_feature_extraction.py b/tests/pipelines/test_pipelines_feature_extraction.py index 4d25941c3f0fd9..c9169056ff97a7 100644 --- a/tests/pipelines/test_pipelines_feature_extraction.py +++ b/tests/pipelines/test_pipelines_feature_extraction.py @@ -171,7 +171,7 @@ def get_shape(self, input_, shape=None): elif isinstance(input_, float): return 0 else: - raise ValueError("We expect lists of floats, nothing else") + raise TypeError("We expect lists of floats, nothing else") return shape def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py index 6ca7ea0681db58..8a0ca08e8dabec 100644 --- a/tests/test_pipeline_mixin.py +++ b/tests/test_pipeline_mixin.py @@ -145,7 +145,7 @@ def run_task_tests(self, task, torch_dtype="float32"): if not isinstance(model_architectures, tuple): model_architectures = (model_architectures,) if not isinstance(model_architectures, tuple): - raise ValueError(f"`model_architectures` must be a tuple. 
Got {type(model_architectures)} instead.") + raise TypeError(f"`model_architectures` must be a tuple. Got {type(model_architectures)} instead.") for model_architecture in model_architectures: model_arch_name = model_architecture.__name__ From 7987710696803c74ce1b5e7f9dfa055096a6c00e Mon Sep 17 00:00:00 2001 From: Bertrand Thia <56003053+bt2513@users.noreply.github.com> Date: Mon, 22 Jul 2024 13:08:27 -0400 Subject: [PATCH 02/73] [RoBERTa] Minor clarifications to model doc (#31949) * minor edits and clarifications * address comment Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/en/model_doc/roberta.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/source/en/model_doc/roberta.md b/docs/source/en/model_doc/roberta.md index 364b5b37e5f3f0..2a1843d8885abe 100644 --- a/docs/source/en/model_doc/roberta.md +++ b/docs/source/en/model_doc/roberta.md @@ -51,19 +51,19 @@ This model was contributed by [julien-c](https://huggingface.co/julien-c). The o ## Usage tips -- This implementation is the same as [`BertModel`] with a tiny embeddings tweak as well as a setup - for Roberta pretrained models. -- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a +- This implementation is the same as [`BertModel`] with a minor tweak to the embeddings, as well as a setup + for RoBERTa pretrained models. +- RoBERTa has the same architecture as BERT but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a different pretraining scheme. -- RoBERTa doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just - separate your segments with the separation token `tokenizer.sep_token` (or ``) -- Same as BERT with better pretraining tricks: - - * dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all - * together to reach 512 tokens (so the sentences are in an order than may span several documents) - * train with larger batches - * use BPE with bytes as a subunit and not characters (because of unicode characters) -- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to this page for usage examples. +- RoBERTa doesn't have `token_type_ids`, so you don't need to indicate which token belongs to which segment. Just + separate your segments with the separation token `tokenizer.sep_token` (or ``). +- RoBERTa is similar to BERT but with better pretraining techniques: + + * Dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all. + * Sentence packing: Sentences are packed together to reach 512 tokens (so the sentences are in an order that may span several documents). + * Larger batches: Training uses larger batches. + * Byte-level BPE vocabulary: Uses BPE with bytes as a subunit instead of characters, accommodating Unicode characters. +- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to its model page for usage examples. 
## Resources From 74d0eb3fedf353bd670aa85ae8fcf4c85f287b5b Mon Sep 17 00:00:00 2001 From: Yoni Gottesman Date: Mon, 22 Jul 2024 20:24:43 +0300 Subject: [PATCH 03/73] Return assistant generated tokens mask in apply_chat_template (#30650) return assistant generated tokens mask in apply_chat_template --- src/transformers/tokenization_utils_base.py | 120 ++++++++++++++-- .../test_tokenization_layoutlmv2.py | 4 + .../test_tokenization_layoutlmv3.py | 4 + .../layoutxlm/test_tokenization_layoutxlm.py | 4 + .../markuplm/test_tokenization_markuplm.py | 4 + tests/models/tapas/test_tokenization_tapas.py | 4 + tests/models/udop/test_tokenization_udop.py | 4 + tests/test_tokenization_common.py | 129 ++++++++++++++++++ 8 files changed, 265 insertions(+), 8 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 6d2e7f502e0089..434eaa2fac8ba2 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1697,6 +1697,7 @@ def apply_chat_template( max_length: Optional[int] = None, return_tensors: Optional[Union[str, TensorType]] = None, return_dict: bool = False, + return_assistant_tokens_mask: bool = False, tokenizer_kwargs: Optional[Dict[str, Any]] = None, **kwargs, ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]: @@ -1747,6 +1748,10 @@ def apply_chat_template( return_dict (`bool`, defaults to `False`): Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`. tokenizer_kwargs (`Dict[str: Any]`, *optional*): Additional kwargs to pass to the tokenizer. + return_assistant_tokens_mask (`bool`, defaults to `False`): + Whether to return a mask of the assistant generated tokens. For tokens generated by the assistant, + the mask will contain 1. For user and system tokens, the mask will contain 0. + This functionality is only available for chat templates that support it via the `{% generation %}` keyword. **kwargs: Additional kwargs to pass to the template renderer. Will be accessible by the chat template. Returns: @@ -1761,6 +1766,9 @@ def apply_chat_template( "of tokenizer outputs to return." ) + if return_assistant_tokens_mask and not return_dict: + raise ValueError("`return_assistant_tokens_mask=True` is incompatible with `return_dict=False`") + if tokenizer_kwargs is None: tokenizer_kwargs = {} @@ -1813,6 +1821,11 @@ def apply_chat_template( "then to ensure that this model continues working without issues." ) + if return_assistant_tokens_mask and not re.search(r"\{\%-?\s*generation\s*-?\%\}", chat_template): + logger.warning_once( + "return_assistant_tokens_mask==True but chat template does not contain `{% generation %}` keyword." 
+ ) + # Compilation function uses a cache to avoid recompiling the same template compiled_template = self._compile_jinja_template(chat_template) @@ -1847,18 +1860,30 @@ def apply_chat_template( raise TypeError("Documents should be a list of dicts with 'title' and 'text' keys!") rendered = [] + all_generation_indices = [] template_kwargs = {**self.special_tokens_map, **kwargs} # kwargs overwrite special tokens if both are present for chat in conversations: if hasattr(chat, "messages"): # Indicates it's a Conversation object chat = chat.messages - rendered_chat = compiled_template.render( - messages=chat, - tools=tool_schemas, - documents=documents, - add_generation_prompt=add_generation_prompt, - **template_kwargs, - ) + if return_assistant_tokens_mask: + rendered_chat, generation_indices = self._render_with_assistant_indices( + compiled_template=compiled_template, + messages=chat, + tools=tool_schemas, + documents=documents, + add_generation_prompt=add_generation_prompt, + **template_kwargs, + ) + all_generation_indices.append(generation_indices) + else: + rendered_chat = compiled_template.render( + messages=chat, + tools=tool_schemas, + documents=documents, + add_generation_prompt=add_generation_prompt, + **template_kwargs, + ) rendered.append(rendered_chat) if not is_batched: @@ -1875,17 +1900,54 @@ def apply_chat_template( **tokenizer_kwargs, ) if return_dict: + if return_assistant_tokens_mask: + assistant_masks = [] + if is_batched or return_tensors: + input_ids = out["input_ids"] + else: + input_ids = [out["input_ids"]] + for i in range(len(input_ids)): + current_mask = [0] * len(input_ids[i]) + for assistant_start_char, assistant_end_char in all_generation_indices[i]: + start_token = out.char_to_token(i, assistant_start_char) + end_token = out.char_to_token(i, assistant_end_char - 1) + if start_token is None: + # start_token is out of bounds maybe due to truncation. 
+ break + for token_id in range(start_token, end_token + 1 if end_token else len(input_ids)): + current_mask[token_id] = 1 + assistant_masks.append(current_mask) + out["assistant_masks"] = assistant_masks if is_batched else assistant_masks[0] return out else: return out["input_ids"] else: return rendered + def _render_with_assistant_indices( + self, compiled_template, messages, tools, documents, add_generation_prompt, **template_kwargs + ): + rendered_blocks = [] + generation_indices = [] + with compiled_template.environment.activate_tracker(rendered_blocks, generation_indices): + for block in compiled_template.generate( + messages=messages, + tools=tools, + documents=documents, + add_generation_prompt=add_generation_prompt, + **template_kwargs, + ): + rendered_blocks.append(block) + rendered_chat = "".join(rendered_blocks) + return rendered_chat, generation_indices + @lru_cache def _compile_jinja_template(self, chat_template): try: import jinja2 + from jinja2 import nodes from jinja2.exceptions import TemplateError + from jinja2.ext import Extension from jinja2.sandbox import ImmutableSandboxedEnvironment except ImportError: raise ImportError("apply_chat_template requires jinja2 to be installed.") @@ -1903,7 +1965,49 @@ def tojson(x, ensure_ascii=False, indent=None, separators=None, sort_keys=False) # We also expose some options like custom indents and separators return json.dumps(x, ensure_ascii=ensure_ascii, indent=indent, separators=separators, sort_keys=sort_keys) - jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True) + class AssistantTracker(Extension): + # This extension is used to track the indices of assistant-generated tokens in the rendered chat + tags = {"generation"} + + def __init__(self, environment: ImmutableSandboxedEnvironment): + # The class is only initiated by jinja. 
+ super().__init__(environment) + environment.extend(activate_tracker=self.activate_tracker) + self._rendered_blocks = None + self._generation_indices = None + + def parse(self, parser: jinja2.parser.Parser) -> jinja2.nodes.CallBlock: + lineno = next(parser.stream).lineno + body = parser.parse_statements(["name:endgeneration"], drop_needle=True) + return nodes.CallBlock(self.call_method("_generation_support"), [], [], body).set_lineno(lineno) + + @jinja2.pass_eval_context + def _generation_support(self, context: jinja2.nodes.EvalContext, caller: jinja2.runtime.Macro) -> str: + rv = caller() + if self.is_active(): + # Only track generation indices if the tracker is active + start_index = len("".join(self._rendered_blocks)) + end_index = start_index + len(rv) + self._generation_indices.append((start_index, end_index)) + return rv + + def is_active(self) -> bool: + return self._rendered_blocks or self._generation_indices + + @contextmanager + def activate_tracker(self, rendered_blocks: list[int], generation_indices: list[int]): + try: + if self.is_active(): + raise ValueError("AssistantTracker should not be reused before closed") + self._rendered_blocks = rendered_blocks + self._generation_indices = generation_indices + + yield + finally: + self._rendered_blocks = None + self._generation_indices = None + + jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True, extensions=[AssistantTracker]) jinja_env.filters["tojson"] = tojson jinja_env.globals["raise_exception"] = raise_exception return jinja_env.from_string(chat_template) diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index 0dbeef0c4176c7..bb526e140e5740 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -2483,3 +2483,7 @@ def test_np_encode_plus_sent_to_model(self): @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask(self): + pass diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index e478e0ac62cb5c..5ea384f0b26422 100644 --- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -2436,3 +2436,7 @@ def test_tf_encode_plus_sent_to_model(self): @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask(self): + pass diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index 2f8b19a662ab73..c0e44fcb30491f 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -1977,3 +1977,7 @@ def test_sentencepiece_tokenize_and_decode(self): @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask(self): + pass diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py index b2c0d20bdb2434..458df94ec2fbcc 100644 --- a/tests/models/markuplm/test_tokenization_markuplm.py +++ b/tests/models/markuplm/test_tokenization_markuplm.py @@ -2316,3 +2316,7 @@ def 
test_chat_template(self): @unittest.skip(reason="The model tested fails `Hub -> Fast == Hub -> Slow`, nothing much we can do") def test_added_tokens_serialization(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask(self): + pass diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index 8fe65438d5cac7..a9b8e9a0c77fa6 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -1277,3 +1277,7 @@ def test_np_encode_plus_sent_to_model(self): @unittest.skip(reason="Chat is not supported") def test_chat_template(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask(self): + pass diff --git a/tests/models/udop/test_tokenization_udop.py b/tests/models/udop/test_tokenization_udop.py index 151695c1c126fc..78153172f2c729 100644 --- a/tests/models/udop/test_tokenization_udop.py +++ b/tests/models/udop/test_tokenization_udop.py @@ -1157,6 +1157,10 @@ def test_offsets_mapping(self): def test_chat_template(self): pass + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") + def test_chat_template_return_assistant_tokens_mask(self): + pass + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 867ca859ebc109..a1fb5124a457f2 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1153,6 +1153,135 @@ def test_chat_template_batched(self): dummy_conversations, chat_template=dummy_template, tokenize=True ) # Check that no error raised + @require_jinja + def test_chat_template_return_assistant_tokens_mask(self): + dummy_template = ( + "{% for message in messages %}" + "{% if (message['role'] != 'assistant') %}" + "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}" + "{% elif (message['role'] == 'assistant')%}" + "{{'<|im_start|>' + message['role'] + '\n'}}" + "{% generation %}" + "{{message['content'] + '<|im_end|>'}}" + "{% endgeneration %}" + "{{'\n'}}" + "{% endif %}" + "{% endfor %}" + ) + conversations = [ + [ + {"role": "system", "content": "system message"}, + {"role": "user", "content": "user message"}, + {"role": "assistant", "content": "start turn 1 assistant message. end turn 1"}, + {"role": "user", "content": "user message 2"}, + {"role": "assistant", "content": "start turn 2 assistant message. end turn 2"}, + ], + [ + {"role": "system", "content": "system message 3"}, + {"role": "user", "content": "user message 3"}, + {"role": "assistant", "content": "start turn 3 assistant message. end turn 3"}, + {"role": "user", "content": "user message 4"}, + {"role": "assistant", "content": "start turn 4 assistant message. end turn 4"}, + ], + ] + + # These are the prefix and suffix strings of all the assistant messages. Used to find the assistant substring + # in the entire chat string, and then find the corresponding tokens in the tokenized output. 
+ assistant_prefix_suffix = [ + [("start turn 1", "end turn 1<|im_end|>"), ("start turn 2", "end turn 2<|im_end|>")], + [("start turn 3", "end turn 3<|im_end|>"), ("start turn 4", "end turn 4<|im_end|>")], + ] + for tokenizer, pretrained_name, _ in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + if not self.test_rust_tokenizer: + self.skipTest(reason="No fast tokenizer defined") + + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name) + + # check batched + output = tokenizer_r.apply_chat_template( + conversations, + chat_template=dummy_template, + tokenize=True, + return_assistant_tokens_mask=True, + return_dict=True, + ) + for i, conv in enumerate(conversations): + chat_string = tokenizer_r.apply_chat_template( + conversations[i], tokenize=False, chat_template=dummy_template + ) + assistant_start = output.char_to_token(i, chat_string.index(assistant_prefix_suffix[i][0][0])) + assistant_end = output.char_to_token( + i, + chat_string.index(assistant_prefix_suffix[i][0][1]) + + len(assistant_prefix_suffix[i][0][1]) + - 1, + ) + + assistant_start2 = output.char_to_token(i, chat_string.index(assistant_prefix_suffix[i][1][0])) + assistant_end2 = output.char_to_token( + i, + chat_string.index(assistant_prefix_suffix[i][1][1]) + + len(assistant_prefix_suffix[i][1][1]) + - 1, + ) + + # assert 1 in first assistant message + self.assertEqual( + output["assistant_masks"][i][assistant_start : assistant_end + 1], + [1] * (assistant_end - assistant_start + 1), + ) + # assert 1 second assistant message + self.assertEqual( + output["assistant_masks"][i][assistant_start2 : assistant_end2 + 1], + [1] * (assistant_end2 - assistant_start2 + 1), + ) + + # assert 0 in user/system indices + self.assertEqual(output["assistant_masks"][i][:assistant_start], [0] * assistant_start) + self.assertEqual( + output["assistant_masks"][i][assistant_end + 1 : assistant_start2], + [0] * (assistant_start2 - assistant_end - 1), + ) + + # check not batched + output = tokenizer_r.apply_chat_template( + conversations[0], + chat_template=dummy_template, + tokenize=True, + return_assistant_tokens_mask=True, + return_dict=True, + ) + + chat_string = tokenizer_r.apply_chat_template( + conversations[0], tokenize=False, chat_template=dummy_template + ) + assistant_start = output.char_to_token(0, chat_string.index(assistant_prefix_suffix[0][0][0])) + assistant_end = output.char_to_token( + 0, chat_string.index(assistant_prefix_suffix[0][0][1]) + len(assistant_prefix_suffix[0][0][1]) - 1 + ) + assistant_start2 = output.char_to_token(0, chat_string.index(assistant_prefix_suffix[0][1][0])) + assistant_end2 = output.char_to_token( + 0, chat_string.index(assistant_prefix_suffix[0][1][1]) + len(assistant_prefix_suffix[0][1][1]) - 1 + ) + + # assert 1 in assistant indices + self.assertEqual( + output["assistant_masks"][assistant_start : assistant_end + 1], + [1] * (assistant_end - assistant_start + 1), + ) + self.assertEqual( + output["assistant_masks"][assistant_start2 : assistant_end2 + 1], + [1] * (assistant_end2 - assistant_start2 + 1), + ) + + # assert 0 in user/system indices + self.assertEqual(output["assistant_masks"][:assistant_start], [0] * assistant_start) + self.assertEqual( + output["assistant_masks"][assistant_end + 1 : assistant_start2], + [0] * (assistant_start2 - assistant_end - 1), + ) + @require_jinja def test_chat_template_dict(self): dummy_template_1 = "{{'a'}}" From 817a676bd711f9626e13578068b36ef09cf572dc Mon Sep 17 00:00:00 2001 From: amyeroberts 
<22614925+amyeroberts@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:29:50 +0100 Subject: [PATCH 04/73] Don't default to other weights file when use_safetensors=True (#31874) * Don't default to other weights file when use_safetensors=True * Add tests * Update tests/utils/test_modeling_utils.py * Add clarifying comments to tests * Update tests/utils/test_modeling_utils.py * Update tests/utils/test_modeling_utils.py --- src/transformers/modeling_utils.py | 15 ++++--- tests/utils/test_modeling_utils.py | 66 ++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index ce0086d1e3bcd8..a2cea6dcdc2483 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -3395,14 +3395,14 @@ def from_pretrained( pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant) ) is_sharded = True - elif os.path.isfile( + elif not use_safetensors and os.path.isfile( os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant)) ): # Load from a PyTorch checkpoint archive_file = os.path.join( pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant) ) - elif os.path.isfile( + elif not use_safetensors and os.path.isfile( os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)) ): # Load from a sharded PyTorch checkpoint @@ -3411,15 +3411,18 @@ def from_pretrained( ) is_sharded = True # At this stage we don't have a weight file so we will raise an error. - elif os.path.isfile( - os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index") - ) or os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME)): + elif not use_safetensors and ( + os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index")) + or os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME)) + ): raise EnvironmentError( f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory" f" {pretrained_model_name_or_path} but there is a file for TensorFlow weights. Use" " `from_tf=True` to load this model from those weights." ) - elif os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME)): + elif not use_safetensors and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME) + ): raise EnvironmentError( f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory" f" {pretrained_model_name_or_path} but there is a file for Flax weights. Use `from_flax=True`" diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py index ec39c1428b28c8..c47f26cffa2d83 100644 --- a/tests/utils/test_modeling_utils.py +++ b/tests/utils/test_modeling_utils.py @@ -815,6 +815,72 @@ def test_checkpoint_variant_local_sharded_safe(self): for p1, p2 in zip(model.parameters(), new_model.parameters()): self.assertTrue(torch.allclose(p1, p2)) + def test_checkpoint_loading_only_safetensors_available(self): + # Test that the loading behaviour is as expected when only safetensor checkpoints are available + # - We can load the model with use_safetensors=True + # - We can load the model without specifying use_safetensors i.e. 
we search for the available checkpoint, + # preferring safetensors + # - We cannot load the model with use_safetensors=False + model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert") + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="50kB", safe_serialization=True) + + weights_index_name = ".".join(SAFE_WEIGHTS_INDEX_NAME.split(".")[:-1] + ["json"]) + weights_index_file = os.path.join(tmp_dir, weights_index_name) + self.assertTrue(os.path.isfile(weights_index_file)) + + for i in range(1, 5): + weights_name = f"model-0000{i}-of-00005" + ".safetensors" + weights_name_file = os.path.join(tmp_dir, weights_name) + self.assertTrue(os.path.isfile(weights_name_file)) + + # Setting use_safetensors=False should raise an error as the checkpoint was saved with safetensors=True + with self.assertRaises(OSError): + _ = BertModel.from_pretrained(tmp_dir, use_safetensors=False) + + # We can load the model with use_safetensors=True + new_model = BertModel.from_pretrained(tmp_dir, use_safetensors=True) + + # We can load the model without specifying use_safetensors + new_model = BertModel.from_pretrained(tmp_dir) + + for p1, p2 in zip(model.parameters(), new_model.parameters()): + self.assertTrue(torch.allclose(p1, p2)) + + def test_checkpoint_loading_only_pytorch_bin_available(self): + # Test that the loading behaviour is as expected when only pytorch checkpoints are available + # - We can load the model with use_safetensors=False + # - We can load the model without specifying use_safetensors i.e. we search for the available checkpoint, + # preferring safetensors but falling back to pytorch + # - We cannot load the model with use_safetensors=True + model = BertModel.from_pretrained("hf-internal-testing/tiny-random-bert") + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="50kB", safe_serialization=False) + + weights_index_name = ".".join(WEIGHTS_INDEX_NAME.split(".")[:-1] + ["json"]) + weights_index_file = os.path.join(tmp_dir, weights_index_name) + self.assertTrue(os.path.isfile(weights_index_file)) + + for i in range(1, 5): + weights_name = WEIGHTS_NAME.split(".")[0].split("_")[0] + f"_model-0000{i}-of-00005" + ".bin" + weights_name_file = os.path.join(tmp_dir, weights_name) + self.assertTrue(os.path.isfile(weights_name_file)) + + # Setting use_safetensors=True should raise an error as the checkpoint was saved with safetensors=False + with self.assertRaises(OSError): + _ = BertModel.from_pretrained(tmp_dir, use_safetensors=True) + + # We can load the model with use_safetensors=False + new_model = BertModel.from_pretrained(tmp_dir, use_safetensors=False) + + # We can load the model without specifying use_safetensors + new_model = BertModel.from_pretrained(tmp_dir) + + for p1, p2 in zip(model.parameters(), new_model.parameters()): + self.assertTrue(torch.allclose(p1, p2)) + def test_checkpoint_variant_hub(self): with tempfile.TemporaryDirectory() as tmp_dir: with self.assertRaises(EnvironmentError): From bd9dca3b855b5a20ea11097b89c40f34d775f1c7 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:42:47 +0200 Subject: [PATCH 05/73] set warning level to info for special tokens have been added (#32138) fixes #7002 --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 
434eaa2fac8ba2..58052579f2be08 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -2506,7 +2506,7 @@ def _from_pretrained( ) if added_tokens_decoder != {} and max(list(added_tokens_decoder.keys())[-1], 0) > tokenizer.vocab_size: - logger.warning_advice( + logger.info( "Special tokens have been added in the vocabulary, make sure the associated word embeddings are" " fine-tuned or trained." ) From 96a074fa7e2c04b904f72d9e827398d4c5f90f25 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Mon, 22 Jul 2024 20:21:59 +0200 Subject: [PATCH 06/73] Add new quant method (#32047) * Add new quant method * update * fix multi-device * add test * add offload * style * style * add simple example * initial doc * docstring * style again * works ? * better docs * switch to non persistant * remove print * fix init * code review --- docs/source/en/_toctree.yml | 2 + docs/source/en/main_classes/quantization.md | 5 + docs/source/en/quantization/fbgemm_fp8.md | 58 ++++ docs/source/en/quantization/overview.md | 1 + src/transformers/__init__.py | 2 + src/transformers/integrations/__init__.py | 2 + src/transformers/integrations/fbgemm_fp8.py | 161 +++++++++++ src/transformers/modeling_utils.py | 12 +- src/transformers/quantizers/auto.py | 11 +- .../quantizers/quantizer_fbgemm_fp8.py | 205 +++++++++++++ src/transformers/testing_utils.py | 8 + src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 5 + src/transformers/utils/quantization_config.py | 32 +++ tests/quantization/fbgemm_fp8/__init__.py | 0 .../fbgemm_fp8/test_fbgemm_fp8.py | 270 ++++++++++++++++++ 16 files changed, 770 insertions(+), 5 deletions(-) create mode 100644 docs/source/en/quantization/fbgemm_fp8.md create mode 100644 src/transformers/integrations/fbgemm_fp8.py create mode 100644 src/transformers/quantizers/quantizer_fbgemm_fp8.py create mode 100644 tests/quantization/fbgemm_fp8/__init__.py create mode 100644 tests/quantization/fbgemm_fp8/test_fbgemm_fp8.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 430670aa4364e6..740bb4b0719c61 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -157,6 +157,8 @@ title: EETQ - local: quantization/hqq title: HQQ + - local: quantization/fbgemm_fp8 + title: FBGEMM_FP8 - local: quantization/optimum title: Optimum - local: quantization/contribute diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md index f1e2acdcfe4809..fc5808415cbe5f 100755 --- a/docs/source/en/main_classes/quantization.md +++ b/docs/source/en/main_classes/quantization.md @@ -56,3 +56,8 @@ Learn how to quantize models in the [Quantization](../quantization) guide. 
## HqqConfig [[autodoc]] HqqConfig + +## FbgemmFp8Config + +[[autodoc]] FbgemmFp8Config + diff --git a/docs/source/en/quantization/fbgemm_fp8.md b/docs/source/en/quantization/fbgemm_fp8.md new file mode 100644 index 00000000000000..4df194d31be7ca --- /dev/null +++ b/docs/source/en/quantization/fbgemm_fp8.md @@ -0,0 +1,58 @@ + + +# FBGEMM FP8 + +With FBGEMM FP8 quantization method, you can quantize your model in FP8 (W8A8): +- the weights will be quantized in 8bit (FP8) per channel +- the activation will be quantized in 8bit (FP8) per token + +It relies on the [FBGEMM](https://github.com/pytorch/FBGEMM) library which provides efficient low-precision general matrix multiplication for small batch sizes and support for accuracy-loss minimizing techniques such as row-wise quantization and outlier-aware quantization. + +> [!TIP] +> You need a GPU with compute capability>=9 (e.g. H100) + +Before you begin, make sure the following libraries are installed with their latest version: + +```bash +pip install --upgrade accelerate fbgemm-gpu torch +``` + +If you are having issues with fbgemm-gpu and torch library, you might need to install the nighlty release. You can follow the instruction [here](https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries:~:text=found%20here.-,Install%20the%20FBGEMM_GPU%20Package,-Install%20through%20PyTorch) + + +```py +from transformers import FbgemmFp8Config, AutoModelForCausalLM, AutoTokenizer + +model_name = "meta-llama/Meta-Llama-3-8B" +quantization_config = FbgemmFp8Config() +quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=quantization_config) + +tokenizer = AutoTokenizer.from_pretrained(model_name) +input_text = "What are we having for dinner?" +input_ids = tokenizer(input_text, return_tensors="pt").to("cuda") + +output = quantized_model.generate(**input_ids, max_new_tokens=10) +print(tokenizer.decode(output[0], skip_special_tokens=True)) +``` + +A quantized model can be saved via "saved_pretrained" and be reused again via the "from_pretrained". + +```py +quant_path = "/path/to/save/quantized/model" +model.save_pretrained(quant_path) +model = AutoModelForCausalLM.from_pretrained(quant_path, device_map="auto") +``` \ No newline at end of file diff --git a/docs/source/en/quantization/overview.md b/docs/source/en/quantization/overview.md index 6cd13fc894633b..99fc669e49f448 100644 --- a/docs/source/en/quantization/overview.md +++ b/docs/source/en/quantization/overview.md @@ -55,4 +55,5 @@ Use the table below to help you decide which quantization method to use. 
| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ | | [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ | | [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto | +| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM | diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 3b7a3a59a7a80e..fe31cd3c237b1f 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -934,6 +934,7 @@ "AwqConfig", "BitsAndBytesConfig", "EetqConfig", + "FbgemmFp8Config", "GPTQConfig", "HqqConfig", "QuantoConfig", @@ -5665,6 +5666,7 @@ AwqConfig, BitsAndBytesConfig, EetqConfig, + FbgemmFp8Config, GPTQConfig, HqqConfig, QuantoConfig, diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py index 9b838bd1608490..4c756a23ae0aa4 100755 --- a/src/transformers/integrations/__init__.py +++ b/src/transformers/integrations/__init__.py @@ -45,6 +45,7 @@ "unset_hf_deepspeed_config", ], "eetq": ["replace_with_eetq_linear"], + "fbgemm_fp8": ["FbgemmFp8Linear", "replace_with_fbgemm_fp8_linear"], "ggml": [ "GGUF_CONFIG_MAPPING", "GGUF_TENSOR_MAPPING", @@ -126,6 +127,7 @@ unset_hf_deepspeed_config, ) from .eetq import replace_with_eetq_linear + from .fbgemm_fp8 import FbgemmFp8Linear, replace_with_fbgemm_fp8_linear from .ggml import ( GGUF_CONFIG_MAPPING, GGUF_TENSOR_MAPPING, diff --git a/src/transformers/integrations/fbgemm_fp8.py b/src/transformers/integrations/fbgemm_fp8.py new file mode 100644 index 00000000000000..a0f5b2b76089b9 --- /dev/null +++ b/src/transformers/integrations/fbgemm_fp8.py @@ -0,0 +1,161 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging + + +if is_torch_available(): + import torch + from torch import nn + +if is_accelerate_available(): + from accelerate import init_empty_weights + +if is_fbgemm_gpu_available(): + import fbgemm_gpu.experimental.gen_ai # noqa: F401 + +logger = logging.get_logger(__name__) + + +class FbgemmFp8Linear(torch.nn.Module): + def __init__(self, in_features, out_features, bias, weight_dtype=torch.float32): + super().__init__() + self.in_features = in_features + self.out_features = out_features + + self.register_buffer("weight", torch.zeros((out_features, in_features), dtype=torch.float8_e4m3fn)) + self.register_buffer("weight_scale", torch.zeros((out_features, 1), dtype=weight_dtype)) + self.register_buffer("input_scale_ub", torch.zeros([1], dtype=torch.float), persistent=False) + + if bias: + self.register_buffer("bias", torch.zeros((self.out_features), dtype=weight_dtype)) + else: + self.bias = None + + def forward(self, x): + num_tokens = None + # x_quantized and x_scale are not necessarily on the same device as x, this is an issue. 
+ # https://github.com/pytorch/FBGEMM/blob/e08af8539c391437f447173863df0f3f6f6f1855/fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu#L1237C3-L1237C45 + x_quantized, x_scale = torch.ops.fbgemm.quantize_fp8_per_row( + x.view(-1, x.shape[-1]), num_tokens, self.input_scale_ub + ) + # moving x_quantized, x_scale here creates glibberish output ... However, if we move the output, it works + # x_quantized, x_scale = x_quantized.to(x.device), x_scale.to(x.device) + + # The computation still happens on the device where self.weight is even if x_quantized is not on the same device as self.weight + output = torch.ops.fbgemm.f8f8bf16_rowwise( + x_quantized, self.weight, x_scale, self.weight_scale, use_fast_accum=True + ) + output = output + self.bias if self.bias is not None else output + # Hacky for now, we have the output to the device of x + output = output.to(x.device) + del x_quantized, x_scale + return output + + +def _replace_with_fbgemm_fp8_linear( + model, + modules_to_not_convert=None, + current_key_name=None, + quantization_config=None, + has_been_replaced=False, + pre_quantized=False, +): + """ + Private method that wraps the recursion for module replacement. + + Returns the converted model and a boolean that indicates if the conversion has been successfull or not. + """ + if current_key_name is None: + current_key_name = [] + + for name, module in model.named_children(): + current_key_name.append(name) + + if (isinstance(module, nn.Linear)) and name not in modules_to_not_convert: + # Check if the current key is not in the `modules_to_not_convert` + current_key_name_str = ".".join(current_key_name) + if not any( + (key + "." in current_key_name_str) or (key == current_key_name_str) for key in modules_to_not_convert + ): + with init_empty_weights(include_buffers=True): + in_features = module.in_features + out_features = module.out_features + model._modules[name] = FbgemmFp8Linear( + in_features, + out_features, + module.bias is not None, + ) + has_been_replaced = True + + # Force requires grad to False to avoid unexpected errors + model._modules[name].requires_grad_(False) + # set non persistant buffer outside of init_empty_weights + model._modules[name].input_scale_ub = torch.tensor( + [quantization_config.activation_scale_ub], dtype=torch.float + ) + if len(list(module.children())) > 0: + _, has_been_replaced = _replace_with_fbgemm_fp8_linear( + module, + modules_to_not_convert, + current_key_name, + quantization_config, + has_been_replaced=has_been_replaced, + pre_quantized=pre_quantized, + ) + # Remove the last key for recursion + current_key_name.pop(-1) + return model, has_been_replaced + + +def replace_with_fbgemm_fp8_linear( + model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, pre_quantized=False +): + """ + A helper function to replace all `torch.nn.Linear` modules by `FbgemmFp8Linear` modules. + This will enable running your models using high performance fp8 kernel from FBGEMM library. + + The function will be run recursively and replace all `torch.nn.Linear` modules except for the `lm_head` that should + be kept as a `torch.nn.Linear` module. The replacement is done under `init_empty_weights` context manager so no + CPU/GPU memory is required to run this function. Each weight will be quantized along the channel. + + Parameters: + model (`torch.nn.Module`): + Input model or `torch.nn.Module` as the function is run recursively. 
+ modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`): + Names of the modules to not convert in `FP8Linear`. In practice we keep the `lm_head` in full precision + for numerical stability reasons. + current_key_name (`List[`str`]`, *optional*): + An array to track the current key of the recursion. This is used to check whether the current key (part of + it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or + `disk`). + """ + + modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert + + if quantization_config.modules_to_not_convert is not None: + modules_to_not_convert.extend(quantization_config.modules_to_not_convert) + modules_to_not_convert = list(set(modules_to_not_convert)) + model, has_been_replaced = _replace_with_fbgemm_fp8_linear( + model, modules_to_not_convert, current_key_name, quantization_config, pre_quantized=pre_quantized + ) + + if not has_been_replaced: + logger.warning( + "You are loading your model using FP8 quantization but no linear modules were found in your model." + " Please double check your model architecture, or submit an issue on github if you think this is" + " a bug." + ) + + return model diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index a2cea6dcdc2483..a20b7d941fbfe6 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -868,7 +868,7 @@ def _load_state_dict_into_meta_model( # We convert floating dtypes to the `dtype` passed. We want to keep the buffers/params # in int/uint/bool and not cast them. - if dtype is not None and torch.is_floating_point(param): + if dtype is not None and torch.is_floating_point(param) and param.dtype != torch.float8_e4m3fn: if ( keep_in_fp32_modules is not None and any( @@ -894,7 +894,6 @@ def _load_state_dict_into_meta_model( old_param = getattr(old_param, split) if old_param is None: break - if old_param is not None: if dtype is None: param = param.to(old_param.dtype) @@ -3955,6 +3954,14 @@ def from_pretrained( and hf_quantizer.quantization_config.quant_method == QuantizationMethod.HQQ ): device_map_kwargs["force_hooks"] = True + if ( + hf_quantizer is not None + and hf_quantizer.quantization_config.quant_method == QuantizationMethod.FBGEMM_FP8 + and isinstance(device_map, dict) + and ("cpu" in device_map.values() or "disk" in device_map.values()) + ): + device_map_kwargs["offload_buffers"] = True + if not is_fsdp_enabled() and not is_deepspeed_zero3_enabled(): dispatch_model(model, **device_map_kwargs) @@ -4105,7 +4112,6 @@ def _fix_key(key): if cls._keys_to_ignore_on_load_unexpected is not None: for pat in cls._keys_to_ignore_on_load_unexpected: unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] - if hf_quantizer is not None: missing_keys = hf_quantizer.update_missing_keys(model, missing_keys, prefix) diff --git a/src/transformers/quantizers/auto.py b/src/transformers/quantizers/auto.py index 2c65afa77e282c..40aa86fc37c733 100755 --- a/src/transformers/quantizers/auto.py +++ b/src/transformers/quantizers/auto.py @@ -20,6 +20,7 @@ AwqConfig, BitsAndBytesConfig, EetqConfig, + FbgemmFp8Config, GPTQConfig, HqqConfig, QuantizationConfigMixin, @@ -31,6 +32,7 @@ from .quantizer_bnb_4bit import Bnb4BitHfQuantizer from .quantizer_bnb_8bit import Bnb8BitHfQuantizer from .quantizer_eetq import EetqHfQuantizer +from .quantizer_fbgemm_fp8 import FbgemmFp8HfQuantizer from .quantizer_gptq import GptqHfQuantizer from 
.quantizer_hqq import HqqHfQuantizer from .quantizer_quanto import QuantoHfQuantizer @@ -45,6 +47,7 @@ "quanto": QuantoHfQuantizer, "eetq": EetqHfQuantizer, "hqq": HqqHfQuantizer, + "fbgemm_fp8": FbgemmFp8HfQuantizer, } AUTO_QUANTIZATION_CONFIG_MAPPING = { @@ -56,6 +59,7 @@ "aqlm": AqlmConfig, "quanto": QuantoConfig, "hqq": HqqConfig, + "fbgemm_fp8": FbgemmFp8Config, } @@ -156,8 +160,11 @@ def merge_quantization_configs( if isinstance(quantization_config, dict): quantization_config = AutoQuantizationConfig.from_dict(quantization_config) - if isinstance(quantization_config, (GPTQConfig, AwqConfig)) and quantization_config_from_args is not None: - # special case for GPTQ / AWQ config collision + if ( + isinstance(quantization_config, (GPTQConfig, AwqConfig, FbgemmFp8Config)) + and quantization_config_from_args is not None + ): + # special case for GPTQ / AWQ / FbgemmFp8 config collision loading_attr_dict = quantization_config_from_args.get_loading_attributes() for attr, val in loading_attr_dict.items(): setattr(quantization_config, attr, val) diff --git a/src/transformers/quantizers/quantizer_fbgemm_fp8.py b/src/transformers/quantizers/quantizer_fbgemm_fp8.py new file mode 100644 index 00000000000000..6591a56fce7840 --- /dev/null +++ b/src/transformers/quantizers/quantizer_fbgemm_fp8.py @@ -0,0 +1,205 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import importlib +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from packaging import version + +from .base import HfQuantizer + + +if TYPE_CHECKING: + from ..modeling_utils import PreTrainedModel + +from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging +from .quantizers_utils import get_module_from_name + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +class FbgemmFp8HfQuantizer(HfQuantizer): + """ + FP8 quantization using fbgemm kernels + """ + + requires_parameters_quantization = True + requires_calibration = False + + required_packages = ["fbgemm-gpu", "accelerate"] + + def __init__(self, quantization_config, **kwargs): + super().__init__(quantization_config, **kwargs) + self.quantization_config = quantization_config + + def validate_environment(self, *args, **kwargs): + if not is_torch_available() or version.parse(importlib.metadata.version("torch")) < version.parse("2.1.0"): + raise ImportError( + "Using fbgemm fp8 quantization requires torch > 2.1.0" + "Please install the latest version of torch ( pip install --upgrade torch )" + ) + if not is_fbgemm_gpu_available(): + raise ImportError( + "Using fbgemm fp8 quantization requires fbgemm-gpu library" + "Please install the latest version of fbgemm-gpu library by following : https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries" + ) + + if not is_accelerate_available("0.32.2"): + raise ImportError( + "Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)" + ) + + if not torch.cuda.is_available(): + raise RuntimeError("Using FP8 quantized models with fbgemm kernels requires a GPU") + + compute_capability = torch.cuda.get_device_capability() + major, minor = compute_capability + if major < 9: + raise ValueError( + "FP8 quantized models is only supported on GPUs with compute capability >= 9.0 (e.g H100)" + ) + + device_map = kwargs.get("device_map", None) + if device_map is None: + logger.warning_once( + "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set " + "your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. " + ) + elif device_map is not None: + if ( + not self.pre_quantized + and isinstance(device_map, dict) + and ("cpu" in device_map.values() or "disk" in device_map.values()) + ): + raise ValueError( + "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device." + "This is not supported when the model is quantized on the fly. " + "Please use a quantized checkpoint or remove the CPU or disk device from the device_map." + ) + + def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype": + if torch_dtype is None: + torch_dtype = torch.bfloat16 + logger.info( + "Overriding torch_dtype=%s with `torch_dtype=torch.bloat16` due to " + "requirements of `fbgemm-gpu` to enable model loading in fp8. " + "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers or pass" + " torch_dtype=torch.bfloat16 to remove this warning.", + torch_dtype, + ) + elif torch_dtype == torch.float16: + raise ValueError( + "You cannot use FP8 with torch_dtype=torch.float16." 
+ "We recommend you passing torch_dtype=torch.bfloat16" + ) + return torch_dtype + + def check_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + state_dict: Dict[str, Any], + **kwargs, + ): + from ..integrations import FbgemmFp8Linear + + module, tensor_name = get_module_from_name(model, param_name) + + if isinstance(module, FbgemmFp8Linear): + if self.pre_quantized or tensor_name == "bias": + if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn: + raise ValueError("Expect quantized weights but got an unquantized weight") + return False + else: + if tensor_name == "weight_scale": + raise ValueError("Expect unquantized weights but got a quantized weight_scale") + return True + return False + + def create_quantized_param( + self, + model: "PreTrainedModel", + param_value: "torch.Tensor", + param_name: str, + target_device: "torch.device", + state_dict: Dict[str, Any], + unexpected_keys: Optional[List[str]] = None, + ): + """ + Quantizes weights into weight and weight_scale + """ + new_value, weight_scale = torch.ops.fbgemm.quantize_fp8_per_row(param_value) + + module, tensor_name = get_module_from_name(model, param_name) + module._buffers[tensor_name] = new_value.to(target_device) + # to have the right output shape -> (out_features, 1) + module._buffers["weight_scale"] = weight_scale.view(weight_scale.shape[0], 1).to(target_device) + + if unexpected_keys is not None and param_name in unexpected_keys: + unexpected_keys.remove(param_name) + del param_name + + def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs): + return model + + def _process_model_before_weight_loading( + self, + model: "PreTrainedModel", + device_map, + keep_in_fp32_modules: List[str] = [], + **kwargs, + ): + from ..integrations import get_keys_to_not_convert, replace_with_fbgemm_fp8_linear + + self.modules_to_not_convert = get_keys_to_not_convert(model) + + if self.quantization_config.modules_to_not_convert is not None: + self.modules_to_not_convert.extend(self.quantization_config.modules_to_not_convert) + + model = replace_with_fbgemm_fp8_linear( + model, + modules_to_not_convert=self.modules_to_not_convert, + quantization_config=self.quantization_config, + pre_quantized=self.pre_quantized, + ) + + model.config.quantization_config = self.quantization_config + + def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]: + from ..integrations import FbgemmFp8Linear + + not_missing_keys = [] + for name, module in model.named_modules(): + if isinstance(module, FbgemmFp8Linear): + for missing in missing_keys: + if ( + (name in missing or name in f"{prefix}.{missing}") + and not missing.endswith(".weight") + and not missing.endswith(".bias") + ): + not_missing_keys.append(missing) + return [k for k in missing_keys if k not in not_missing_keys] + + @property + def is_serializable(self): + return True + + @property + def is_trainable(self) -> bool: + return False diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 60ff7815a971ae..edfc9519963bee 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -68,6 +68,7 @@ is_eetq_available, is_essentia_available, is_faiss_available, + is_fbgemm_gpu_available, is_flash_attn_2_available, is_flax_available, is_fsdp_available, @@ -1116,6 +1117,13 @@ def require_quanto(test_case): return unittest.skipUnless(is_quanto_available(), "test requires quanto")(test_case) +def 
require_fbgemm_gpu(test_case): + """ + Decorator for fbgemm_gpu dependency + """ + return unittest.skipUnless(is_fbgemm_gpu_available(), "test requires fbgemm-gpu")(test_case) + + def require_phonemizer(test_case): """ Decorator marking a test that requires phonemizer diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 351ab0cf11ffba..efe473a6cdeda2 100755 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -127,6 +127,7 @@ is_eetq_available, is_essentia_available, is_faiss_available, + is_fbgemm_gpu_available, is_flash_attn_2_available, is_flash_attn_greater_or_equal, is_flash_attn_greater_or_equal_2_10, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index bd14dd8cd7530c..f81b9d3dba41bd 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -98,6 +98,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _av_available = importlib.util.find_spec("av") is not None _bitsandbytes_available = _is_package_available("bitsandbytes") _eetq_available = _is_package_available("eetq") +_fbgemm_gpu_available = _is_package_available("fbgemm_gpu") _galore_torch_available = _is_package_available("galore_torch") _lomo_available = _is_package_available("lomo_optim") # `importlib.metadata.version` doesn't work with `bs4` but `beautifulsoup4`. For `importlib.util.find_spec`, reversed. @@ -888,6 +889,10 @@ def is_eetq_available(): return _eetq_available +def is_fbgemm_gpu_available(): + return _fbgemm_gpu_available + + def is_levenshtein_available(): return _levenshtein_available diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 506c4db447c7aa..5de8307c3bd79b 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -42,6 +42,7 @@ class QuantizationMethod(str, Enum): QUANTO = "quanto" EETQ = "eetq" HQQ = "hqq" + FBGEMM_FP8 = "fbgemm_fp8" class AWQLinearVersion(str, Enum): @@ -1047,3 +1048,34 @@ def post_init(self): accepted_weights = ["int8"] if self.weights not in accepted_weights: raise ValueError(f"Only support weights in {accepted_weights} but found {self.weights}") + + +@dataclass +class FbgemmFp8Config(QuantizationConfigMixin): + """ + This is a wrapper class about all possible attributes and features that you can play with a model that has been + loaded using fbgemm fp8 quantization. + + Args: + activation_scale_ub (`float`, *optional*, defaults to 1200.0): + The activation scale upper bound. This is used when quantizing the input activation. + modules_to_not_convert (`list`, *optional*, default to `None`): + The list of modules to not quantize, useful for quantizing models that explicitly require to have + some modules left in their original precision. 
+ """ + + def __init__( + self, + activation_scale_ub: float = 1200.0, + modules_to_not_convert: Optional[List] = None, + **kwargs, + ): + self.quant_method = QuantizationMethod.FBGEMM_FP8 + self.activation_scale_ub = activation_scale_ub + self.modules_to_not_convert = modules_to_not_convert + + def get_loading_attributes(self): + attibutes_dict = copy.deepcopy(self.__dict__) + loading_attibutes = ["activation_scale_ub"] + loading_attibutes_dict = {i: j for i, j in attibutes_dict.items() if i in loading_attibutes} + return loading_attibutes_dict diff --git a/tests/quantization/fbgemm_fp8/__init__.py b/tests/quantization/fbgemm_fp8/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/quantization/fbgemm_fp8/test_fbgemm_fp8.py b/tests/quantization/fbgemm_fp8/test_fbgemm_fp8.py new file mode 100644 index 00000000000000..61a1eecba8d3df --- /dev/null +++ b/tests/quantization/fbgemm_fp8/test_fbgemm_fp8.py @@ -0,0 +1,270 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import tempfile +import unittest + +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, FbgemmFp8Config, OPTForCausalLM +from transformers.testing_utils import ( + require_accelerate, + require_fbgemm_gpu, + require_read_token, + require_torch_gpu, + require_torch_multi_gpu, + slow, + torch_device, +) +from transformers.utils import is_accelerate_available, is_torch_available + + +if is_torch_available(): + import torch + +if is_accelerate_available(): + from accelerate import init_empty_weights + + +@require_torch_gpu +class FbgemmFp8ConfigTest(unittest.TestCase): + def test_to_dict(self): + """ + Simple test that checks if one uses a config and converts it to a dict, the dict is the same as the config object + """ + quantization_config = FbgemmFp8Config() + config_to_dict = quantization_config.to_dict() + + for key in config_to_dict: + self.assertEqual(getattr(quantization_config, key), config_to_dict[key]) + + def test_from_dict(self): + """ + Simple test that checks if one uses a dict and converts it to a config object, the config object is the same as the dict + """ + dict = {"modules_to_not_convert": ["lm_head.weight"], "quant_method": "fbgemm_fp8"} + quantization_config = FbgemmFp8Config.from_dict(dict) + + self.assertEqual(dict["modules_to_not_convert"], quantization_config.modules_to_not_convert) + self.assertEqual(dict["quant_method"], quantization_config.quant_method) + + +@slow +@require_torch_gpu +@require_fbgemm_gpu +@require_accelerate +@require_read_token +class FbgemmFp8Test(unittest.TestCase): + model_name = "meta-llama/Meta-Llama-3-8B" + + input_text = "What are we having for dinner?" 
+ max_new_tokens = 9 + + EXPECTED_OUTPUT = "What are we having for dinner?\nI'm having a steak and a salad" + + device_map = "cuda" + + offload_device_map = { + "model.embed_tokens": 0, + "model.layers.0": 0, + "model.layers.1": 0, + "model.layers.2": 0, + "model.layers.3": 0, + "model.layers.4": 0, + "model.layers.5": 0, + "model.layers.6": 0, + "model.layers.7": 0, + "model.layers.8": 0, + "model.layers.9": 0, + "model.layers.10": 0, + "model.layers.11": 0, + "model.layers.12": 0, + "model.layers.13": 0, + "model.layers.14": 0, + "model.layers.15": 0, + "model.layers.16": "cpu", + "model.layers.17": "cpu", + "model.layers.18": "cpu", + "model.layers.19": "cpu", + "model.layers.20": "disk", + "model.layers.21": "disk", + "model.layers.22": "disk", + "model.layers.23": "disk", + "model.layers.24": "disk", + "model.layers.25": "disk", + "model.layers.26": "disk", + "model.layers.27": "disk", + "model.layers.28": "disk", + "model.layers.29": "disk", + "model.layers.30": "disk", + "model.layers.31": "disk", + "model.norm": "disk", + "lm_head": "disk", + } + + # called only once for all test in this class + @classmethod + def setUpClass(cls): + """ + Setup quantized model + """ + quantization_config = FbgemmFp8Config() + cls.tokenizer = AutoTokenizer.from_pretrained(cls.model_name) + cls.quantized_model = AutoModelForCausalLM.from_pretrained( + cls.model_name, device_map=cls.device_map, quantization_config=quantization_config + ) + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + gc.collect() + + def test_quantized_model_conversion(self): + """ + Simple test that checks if the quantized model has been converted properly + """ + + from transformers.integrations import FbgemmFp8Linear, replace_with_fbgemm_fp8_linear + + model_id = "facebook/opt-350m" + config = AutoConfig.from_pretrained(model_id, revision="cb32f77e905cccbca1d970436fb0f5e6b58ee3c5") + quantization_config = FbgemmFp8Config() + + with init_empty_weights(): + model = OPTForCausalLM(config) + + nb_linears = 0 + for module in model.modules(): + if isinstance(module, torch.nn.Linear): + nb_linears += 1 + + model = replace_with_fbgemm_fp8_linear(model, quantization_config=quantization_config) + nb_fbgemm_linear = 0 + for module in model.modules(): + if isinstance(module, FbgemmFp8Linear): + nb_fbgemm_linear += 1 + + self.assertEqual(nb_linears - 1, nb_fbgemm_linear) + + with init_empty_weights(): + model = OPTForCausalLM(config) + quantization_config = FbgemmFp8Config(modules_to_not_convert=["fc1"]) + model = replace_with_fbgemm_fp8_linear(model, quantization_config=quantization_config) + nb_fbgemm_linear = 0 + for module in model.modules(): + if isinstance(module, FbgemmFp8Linear): + nb_fbgemm_linear += 1 + + self.assertEqual(nb_linears - 25, nb_fbgemm_linear) + + def test_quantized_model(self): + """ + Simple test that checks if the quantized model is working properly + """ + input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + + output = self.quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens) + self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + + def test_save_pretrained(self): + """ + Simple test that checks if the quantized model is working properly after being saved and loaded + """ + with tempfile.TemporaryDirectory() as tmpdirname: + self.quantized_model.save_pretrained(tmpdirname) + + model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.device_map) + + input_ids = 
self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + + output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens) + self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + + def test_change_loading_attributes(self): + """ + Simple test that checks if the quantized model is working properly after being saved and loaded + """ + with tempfile.TemporaryDirectory() as tmpdirname: + self.quantized_model.save_pretrained(tmpdirname) + + quantization_config = FbgemmFp8Config(activation_scale_ub=1000.0) + + model = AutoModelForCausalLM.from_pretrained( + tmpdirname, device_map=self.device_map, quantization_config=quantization_config + ) + + self.assertEqual(model.model.layers[1].mlp.down_proj.input_scale_ub.item(), 1000.0) + + input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + + output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens) + self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + + @require_torch_multi_gpu + def test_quantized_model_multi_gpu(self): + """ + Simple test that checks if the quantized model is working properly with multiple GPUs + set CUDA_VISIBLE_DEVICES=0,1 if you have more than 2 GPUS + """ + input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + quantization_config = FbgemmFp8Config() + quantized_model = AutoModelForCausalLM.from_pretrained( + self.model_name, device_map="auto", quantization_config=quantization_config + ) + self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1}) + + output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens) + self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + + def test_quantized_model_offload(self): + """ + Simple test that checks if the quantized model returns an error when loading with cpu/disk offloaded + """ + quantization_config = FbgemmFp8Config() + + with self.assertRaisesRegex( + ValueError, "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device." 
+ ): + AutoModelForCausalLM.from_pretrained( + self.model_name, device_map=self.offload_device_map, quantization_config=quantization_config + ) + + def test_save_pretrained_offload(self): + """ + Simple test that checks if the saved quantized model is working properly cpu/disk offload + """ + with tempfile.TemporaryDirectory() as tmpdirname: + self.quantized_model.save_pretrained(tmpdirname) + + input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + + quantized_model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map=self.offload_device_map) + output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens) + self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + + @require_torch_multi_gpu + def test_save_pretrained_multi_gpu(self): + """ + Simple test that checks if the quantized model is working properly after being saved and loaded + """ + with tempfile.TemporaryDirectory() as tmpdirname: + self.quantized_model.save_pretrained(tmpdirname) + + model = AutoModelForCausalLM.from_pretrained(tmpdirname, device_map="auto") + self.assertTrue(set(model.hf_device_map.values()) == {0, 1}) + + input_ids = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + + output = model.generate(**input_ids, max_new_tokens=self.max_new_tokens) + self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) From 251a2409c694c29ee28e66c954670c483cf54961 Mon Sep 17 00:00:00 2001 From: James Thewlis Date: Tue, 23 Jul 2024 01:12:16 -0400 Subject: [PATCH 07/73] Add llama3-llava-next-8b to llava_next conversion script (#31395) * Add llama3-llava-next-8b to llava_next conversion script Adds support for the lmms-lab/llama3-llava-next-8b model to the convert_llava_next_weights_to_hf.py script, along with an example prompt generated from the llava_llama_3 conv_template in the LLaVA-NeXT repo. * Exclude <|begin_of_text|> from prompt example This token gets added automatically, so it should not be included in the prompt example. * Add llava-next-72b and llava-next-110b Adds the Qwen-based LLaVA-Next models to the conversion script, along with changes to load the models on multiple GPUs for inference. 
* Add llama3 and qwen prompt formats to docs * Chat prompt and padding side left for llama3 batched * update * Update src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * remove code * better naming --------- Co-authored-by: raushan Co-authored-by: Raushan Turganbay Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/model_doc/llava_next.md | 11 ++ .../convert_llava_next_weights_to_hf.py | 125 +++++++++++++----- 2 files changed, 101 insertions(+), 35 deletions(-) diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index b9d06ff97ffa53..9e7caa37d7b9bc 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -100,6 +100,17 @@ print(text_prompt) "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" ``` +[llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llava-next-8b-hf) requires the following format: + +```bash +"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +``` + +[llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf) and [llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf) require the following format: + +```bash +"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n" +``` ## Usage example diff --git a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py b/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py index 2c8aefe39dc255..06edc5c9b1adbc 100644 --- a/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py +++ b/src/transformers/models/llava_next/convert_llava_next_weights_to_hf.py @@ -24,6 +24,7 @@ """ import argparse +import gc import glob import json from pathlib import Path @@ -111,6 +112,16 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): elif model_id == "liuhaotian/llava-v1.6-34b": text_model_id = "NousResearch/Nous-Hermes-2-Yi-34B" image_token_index = 64000 + elif model_id == "lmms-lab/llama3-llava-next-8b": + text_model_id = "meta-llama/Meta-Llama-3-8B-Instruct" + image_token_index = 128256 + elif model_id == "lmms-lab/llava-next-72b": + text_model_id = "Qwen/Qwen1.5-72B-Chat" + image_token_index = 151646 + elif model_id == "lmms-lab/llava-next-110b": + text_model_id = "Qwen/Qwen1.5-110B-Chat" + image_token_index = 151646 + vision_model_id = data["mm_vision_tower"] torch.set_default_dtype(torch.float16) @@ -120,7 +131,7 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): tokenizer = AutoTokenizer.from_pretrained(text_model_id, use_fast=use_fast) tokenizer.add_tokens(AddedToken("", special=True, normalized=False), special_tokens=True) - if model_id == "liuhaotian/llava-v1.6-mistral-7b": + if model_id in 
("liuhaotian/llava-v1.6-mistral-7b", "lmms-lab/llama3-llava-next-8b"): # Mistral-7B doesn't have a padding token set yet tokenizer.add_special_tokens({"pad_token": ""}) @@ -151,28 +162,45 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): # We add an image token so we resize the model # Pad to 64 for performance reasons - pad_shape = 64 - vocab_size = config.text_config.vocab_size - if model_id == "liuhaotian/llava-v1.6-34b": - # this one has 3 additional tokens, namely <|startoftext|>, <|endoftext|> and - num_tokens = vocab_size + 3 - else: - # this one has 2 additional tokens, namely and - num_tokens = vocab_size + 2 - model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape) - model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( - tuple( - (dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0])) - ), - dim=0, - ) - model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( - tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), - dim=0, - ) + # Qwen-based models have extra unused space in the vocab size already, so no need to resize + if model_id not in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: + pad_shape = 64 + vocab_size = config.text_config.vocab_size + if model_id == "liuhaotian/llava-v1.6-34b": + # this one has 3 additional tokens, namely <|startoftext|>, <|endoftext|> and + num_tokens = vocab_size + 3 + else: + # this one has 2 additional tokens, namely and + num_tokens = vocab_size + 2 + model.resize_token_embeddings(num_tokens, pad_to_multiple_of=pad_shape) + model.language_model.model.embed_tokens.weight.data[vocab_size:] = torch.stack( + tuple( + ( + dist.sample() + for _ in range(model.language_model.model.embed_tokens.weight.data[vocab_size:].shape[0]) + ) + ), + dim=0, + ) + model.language_model.lm_head.weight.data[vocab_size:] = torch.stack( + tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[vocab_size:].shape[0]))), + dim=0, + ) + + print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + # Make space so we can load the model properly now. + del state_dict + gc.collect() - device = "cuda:2" - model.to(device) + # Load everything back for inference tests in float32 because prev script was written as that + # Though it's mostly loaded in fp16 as original weights are in fp16 + model = LlavaNextForConditionalGeneration.from_pretrained(pytorch_dump_folder_path, device_map="auto") + processor = LlavaNextProcessor.from_pretrained(pytorch_dump_folder_path) + device = model.device # prepare inputs image = load_image() @@ -182,6 +210,11 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT:" elif model_id == "liuhaotian/llava-v1.6-34b": prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" + elif model_id == "lmms-lab/llama3-llava-next-8b": + prompt = "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" + elif model_id in ["lmms-lab/llava-next-72b", "lmms-lab/llava-next-110b"]: + prompt = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n" + inputs = processor(images=image, text=prompt, return_tensors="pt") # verify inputs @@ -194,8 +227,6 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): original_input_ids = torch.load(filepath, map_location="cpu") # replace -200 by image_token_index (since we use token ID = 32000 for the image token) original_input_ids[original_input_ids == -200] = image_token_index - print(tokenizer.decode([id for id in original_input_ids.tolist()[0] if id != -200])) - assert original_input_ids[0].tolist() == inputs.input_ids[0].tolist() elif model_id == "liuhaotian/llava-v1.6-34b": @@ -243,6 +274,26 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): dtype=torch.float32, device=device, ) + elif model_id == "lmms-lab/llama3-llava-next-8b": + expected_slice = torch.tensor( + [[-3.9648, 1.1396, 3.3145], [-5.3594, -1.5654, -1.9619], [-12.3750, -10.6797, -9.3125]], + dtype=torch.float32, + device=device, + ) + elif model_id == "lmms-lab/llava-next-72b": + # Not yet checked against reference + expected_slice = torch.tensor( + [[3.7148, 3.9277, 3.4395], [-0.4341, 1.1387, 6.5117], [3.2324, 3.4688, 4.1133]], + dtype=torch.float32, + device=device, + ) + elif model_id == "lmms-lab/llava-next-110b": + # Not yet checked against reference + expected_slice = torch.tensor( + [[-2.5449, -1.6738, -2.0371], [1.0811, 3.4961, 5.0312], [1.7803, 2.5137, 2.4277]], + dtype=torch.float32, + device=device, + ) else: raise ValueError(f"Model {model_id} not supported") @@ -268,6 +319,12 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): expected_text = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT: The image appears to be a radar chart, also known as a spider chart or star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular radar chart, there are several variables represented:\n\n- MM-Vet\n- LLa-Va-Bench\n- SEED-Bench\n- MM" elif model_id == "liuhaotian/llava-v1.6-34b": expected_text = "<|im_start|> system\nAnswer the questions. <|im_start|> user\n\nWhat is shown in this image? <|im_start|> assistant\nThe image appears to be a radar chart, also known as a spider chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point.\n\nIn this particular chart, there are several datasets represented by different colors and labeled with various acronyms such as MM-Vet, LLaVA-Bench, SEED-Bench, MM-Bench-CN, MM-" + elif model_id == "lmms-lab/llama3-llava-next-8b": + expected_text = 'system\n\nYou are a helpful language and vision assistant. 
You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.user\n\n\nWhat is shown in this image?assistant\n\n\nThe image shows a radar chart, also known as a spider chart or a web chart, which is a type of graph used to display multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the values are plotted along each axis and connected to form a polygon.\n\nIn this particular radar chart, there are several axes labeled with different variables, such as "MM-Vet," "LL' + elif model_id == "lmms-lab/llava-next-72b": + expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image displays a radar chart, also known as a spider chart or a star chart, which is a graphical method of displaying multivariate data in the form of a two-dimensional chart of three or more quantitative variables represented on axes starting from the same point. Each axis represents a different variable, and the value of each variable is represented by the distance from the center of the chart to the point where the axis intersects with the line representing that variable's value.\n\nIn this particular chart, there are several axes" + elif model_id == "lmms-lab/llava-next-110b": + expected_text = "system\nYou are a helpful assistant.\nuser\n\nWhat is shown in this image?\nassistant\nThe image shows a radar chart comparing the performance of different models on various visual question answering (VQA) benchmarks. Each colored line represents a different model, and the distance from the center of the chart indicates the score or performance level of the model on a particular benchmark. The benchmarks are labeled around the edges of the chart, and include VQA v2, GQA, VizWiz, TextVQA, MMBench-CN, MME, and others. The chart allows for a" else: raise ValueError(f"Model {model_id} not supported") @@ -281,7 +338,7 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): inputs = processor( images=[image, cats_image], - text=[prompt, "[INST] \nHow many cats are there? 
[/INST]"], + text=[prompt, prompt], padding=True, return_tensors="pt", ).to(device) @@ -305,16 +362,11 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True) print(outputs) - if pytorch_dump_folder_path is not None: - print(f"Saving model and processor for {model_id} to {pytorch_dump_folder_path}") - Path(pytorch_dump_folder_path).mkdir(exist_ok=True) - model.save_pretrained(pytorch_dump_folder_path) - processor.save_pretrained(pytorch_dump_folder_path) - if push_to_hub: - repo_id = model_id.split("/")[-1] - model.push_to_hub(f"llava-hf/{repo_id}-hf") - processor.push_to_hub(f"llava-hf/{repo_id}-hf") + checkpoint_name = model_id.split("/")[-1] + print(f"Pushing to repo llava-hf/{checkpoint_name}-hf") + model.push_to_hub(f"llava-hf/{checkpoint_name}-hf") + processor.push_to_hub(f"llava-hf/{checkpoint_name}-hf") if __name__ == "__main__": @@ -328,11 +380,14 @@ def convert_llava_to_hf(model_id, pytorch_dump_folder_path, push_to_hub=False): "liuhaotian/llava-v1.6-vicuna-7b", "liuhaotian/llava-v1.6-vicuna-13b", "liuhaotian/llava-v1.6-34b", + "lmms-lab/llama3-llava-next-8b", + "lmms-lab/llava-next-72b", + "lmms-lab/llava-next-110b", ], required=False, ) parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + "--pytorch_dump_folder_path", type=str, required=True, help="Path to the output PyTorch model directory." ) parser.add_argument( "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." From 3aefb4ec7f957f9561a410eabc6f9d57b2f0384f Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 23 Jul 2024 10:23:55 +0500 Subject: [PATCH 08/73] LLaVaNeXT: pad on right if training (#32134) * pad on right if training * docs * add tests --- docs/source/en/model_doc/llava-next-video.md | 7 ++++ docs/source/en/model_doc/llava_next.md | 7 ++++ .../models/llava_next/modeling_llava_next.py | 4 +- .../modeling_llava_next_video.py | 4 +- .../llava_next/test_modeling_llava_next.py | 38 ++++++++++++++++-- .../test_modeling_llava_next_video.py | 40 +++++++++++++++++-- 6 files changed, 90 insertions(+), 10 deletions(-) diff --git a/docs/source/en/model_doc/llava-next-video.md b/docs/source/en/model_doc/llava-next-video.md index 88e41efc29c87c..48e50f950621e8 100644 --- a/docs/source/en/model_doc/llava-next-video.md +++ b/docs/source/en/model_doc/llava-next-video.md @@ -43,6 +43,13 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre - We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating. + + +- Llava-Next uses different number of patches for images and thus has to pad the inputs inside modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if model is in `eval()` mode, otherwise "right-padding". + + + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that. We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. 
Each content field has to be a list of dicts, as follows: diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index 9e7caa37d7b9bc..0c25ed32db5ab3 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -46,6 +46,13 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ - We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating. + + +- Llava-Next uses different number of patches for images and thus has to pad the inputs inside modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if model is in `eval()` mode, otherwise "right-padding". + + + + - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history, passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint. We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows: diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 5b897b817330b7..ad76561df54fd7 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -518,8 +518,8 @@ def _merge_input_ids_with_image_features( _left_padding = torch.any(attention_mask[:, 0] == 0) _right_padding = torch.any(attention_mask[:, -1] == 0) - left_padding = True - if batch_size > 1: + left_padding = True if not self.training else False + if batch_size > 1 and not self.training: if _left_padding and not _right_padding: left_padding = True elif not _left_padding and _right_padding: diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index f2ccb99e618753..e3264dfd91e1a1 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -562,8 +562,8 @@ def _merge_input_ids_with_image_features( _left_padding = torch.any(attention_mask[:, 0] == 0) _right_padding = torch.any(attention_mask[:, -1] == 0) - left_padding = True - if batch_size > 1: + left_padding = True if not self.training else False + if batch_size > 1 and not self.training: if _left_padding and not _right_padding: left_padding = True elif not _left_padding and _right_padding: diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 69794a85d9fe39..70d91002a91bc3 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -123,7 +123,7 @@ def __init__( self.batch_size = 3 self.num_channels = 3 self.image_size = 30 - 
self.encoder_seq_length = 341 + self.encoder_seq_length = 342 self.image_grid_pinpoints = [[32, 32]] def get_config(self): @@ -156,9 +156,7 @@ def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, pixel_values = config_and_inputs input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 - # make attention mask left-padded to avoid issues with "model has no attribute padding_side" attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 # we are giving 3 images let's make sure we pass in 3 image tokens input_ids[:, 1] = config.image_token_index labels = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device) @@ -473,3 +471,37 @@ def test_small_model_integration_test_batch_matches_single(self): self.processor.decode(output_batched[0], skip_special_tokens=True), self.processor.decode(output_single[0], skip_special_tokens=True), ) + + @slow + @require_bitsandbytes + def test_padding_side_when_merging_inputs(self): + model = LlavaNextForConditionalGeneration.from_pretrained( + "llava-hf/llava-v1.6-mistral-7b-hf", + load_in_4bit=True, + ) + + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + lowres_url = "https://4.img-dpreview.com/files/p/TS560x560~forums/56876524/03975b28741443319e9a94615e35667e" + cats_image = Image.open(requests.get(url, stream=True).raw) + lowres_img = Image.open(requests.get(lowres_url, stream=True).raw) + + inputs_batched = self.processor( + [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True + ).to(torch_device) + + # model is in eval mode by default so we should get pad on the left side + # we can check the first hidden-states (aka inputs embeds) + # the first element was lo-res image and we expect the first 1414 tokens to be all pads + output_eval = model(**inputs_batched, output_hidden_states=True) + self.assertTrue((output_eval.hidden_states[0][0, :1414, ...] == 0).all().item()) + + # otherwise padding is on the right side, so it's last 1414 tokens + self.processor.padding_side = "right" + inputs_batched = self.processor( + [self.prompt, self.prompt], images=[lowres_img, cats_image], return_tensors="pt", padding=True + ).to(torch_device) + + model.train() + with torch.no_grad(): + output_train = model(**inputs_batched, output_hidden_states=True) + self.assertTrue((output_train.hidden_states[0][0, -1414:, ...] 
== 0).all().item()) diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index afe3062fb50e0e..9ba7ef869ddf00 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -124,7 +124,7 @@ def __init__( self.batch_size = 3 self.num_channels = 3 self.image_size = 30 - self.encoder_seq_length = 468 + self.encoder_seq_length = 469 self.image_grid_pinpoints = [[32, 32]] def get_config(self): @@ -166,9 +166,7 @@ def prepare_config_and_inputs(self): def prepare_config_and_inputs_for_common(self): config, pixel_values, pixel_values_videos = self.prepare_config_and_inputs() input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 - # make attention mask left-padded to avoid issues with "model has no attribute padding_side" attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) - attention_mask[:, :1] = 0 # we are giving 3 images and videos let's make sure we pass in 3 special tokens input_ids[:, 1] = config.image_token_index input_ids[:, 2] = config.video_token_index @@ -453,3 +451,39 @@ def test_small_model_integration_test_batch_matches_single(self): self.processor.decode(output_batched[0], skip_special_tokens=True), self.processor.decode(output_single[0], skip_special_tokens=True), ) + + @slow + @require_bitsandbytes + def test_padding_side_when_merging_inputs(self): + model = LlavaNextVideoForConditionalGeneration.from_pretrained( + "llava-hf/LLaVA-NeXT-Video-7B-hf", load_in_4bit=True + ) + + inputs_batched = self.processor( + [self.prompt_video, self.prompt_image], + images=[self.image], + videos=[self.video], + return_tensors="pt", + padding=True, + ).to(torch_device) + + # model is in eval mode by default so we should get pad on the left side + # we can check the first hidden-states (aka inputs embeds) + # the first element was lo-res image and we expect the first 1482 tokens to be all pads + output_eval = model(**inputs_batched, output_hidden_states=True) + self.assertTrue((output_eval.hidden_states[0][0, :1482, ...] == 0).all().item()) + + # otherwise padding is on the right side, so it's last 1482 tokens + self.processor.padding_side = "right" + inputs_batched = self.processor( + [self.prompt_video, self.prompt_image], + images=[self.image], + videos=[self.video], + return_tensors="pt", + padding=True, + ).to(torch_device) + + model.train() + with torch.no_grad(): + output_train = model(**inputs_batched, output_hidden_states=True) + self.assertTrue((output_train.hidden_states[0][0, -1482:, ...] 
== 0).all().item()) From f83c6f1d02fba5e5ced9357b9c9196c76d937af3 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:54:38 +0800 Subject: [PATCH 09/73] Remove `trust_remote_code` when loading Libri Dummy (#31748) * [whisper integration] use parquet dataset for testing * propagate to others * more propagation * last one --- src/transformers/commands/pt_to_tf.py | 4 +- src/transformers/generation/logits_process.py | 6 +- src/transformers/models/clvp/modeling_clvp.py | 4 +- .../modeling_speech_to_text_2.py | 2 +- .../models/hubert/modeling_hubert.py | 2 +- .../models/hubert/modeling_tf_hubert.py | 4 +- .../modeling_speech_encoder_decoder.py | 2 +- .../speech_to_text/modeling_speech_to_text.py | 4 +- .../modeling_tf_speech_to_text.py | 2 +- .../models/univnet/modeling_univnet.py | 2 +- .../models/wav2vec2/modeling_flax_wav2vec2.py | 6 +- .../models/wav2vec2/modeling_tf_wav2vec2.py | 4 +- .../models/wav2vec2/modeling_wav2vec2.py | 2 +- .../modeling_wav2vec2_conformer.py | 2 +- .../models/whisper/generation_whisper.py | 2 +- .../models/whisper/modeling_flax_whisper.py | 8 +- .../models/whisper/modeling_tf_whisper.py | 6 +- .../models/whisper/modeling_whisper.py | 6 +- ...xtraction_audio_spectrogram_transformer.py | 4 +- .../clap/test_feature_extraction_clap.py | 4 +- tests/models/clap/test_modeling_clap.py | 16 +-- .../clvp/test_feature_extraction_clvp.py | 4 +- tests/models/clvp/test_modeling_clvp.py | 8 +- .../data2vec/test_modeling_data2vec_audio.py | 4 +- .../test_feature_extraction_encodec.py | 4 +- tests/models/encodec/test_modeling_encodec.py | 12 +-- tests/models/hubert/test_modeling_hubert.py | 4 +- .../models/hubert/test_modeling_tf_hubert.py | 4 +- .../test_feature_extraction_pop2piano.py | 4 +- .../pop2piano/test_processor_pop2piano.py | 4 +- .../test_feature_extraction_seamless_m4t.py | 4 +- tests/models/sew/test_modeling_sew.py | 4 +- tests/models/sew_d/test_modeling_sew_d.py | 4 +- .../test_feature_extraction_speech_to_text.py | 4 +- .../test_modeling_speech_to_text.py | 4 +- .../test_modeling_tf_speech_to_text.py | 4 +- .../test_feature_extraction_speecht5.py | 4 +- .../models/speecht5/test_modeling_speecht5.py | 8 +- .../unispeech/test_modeling_unispeech.py | 4 +- .../test_modeling_unispeech_sat.py | 4 +- .../test_feature_extraction_univnet.py | 4 +- tests/models/univnet/test_modeling_univnet.py | 4 +- .../wav2vec2/test_modeling_flax_wav2vec2.py | 4 +- .../wav2vec2/test_modeling_tf_wav2vec2.py | 4 +- .../models/wav2vec2/test_modeling_wav2vec2.py | 4 +- .../test_modeling_wav2vec2_bert.py | 4 +- .../test_modeling_wav2vec2_conformer.py | 4 +- tests/models/wavlm/test_modeling_wavlm.py | 4 +- .../test_feature_extraction_whisper.py | 4 +- .../whisper/test_modeling_flax_whisper.py | 4 +- .../whisper/test_modeling_tf_whisper.py | 2 +- tests/models/whisper/test_modeling_whisper.py | 26 ++--- .../test_pipelines_audio_classification.py | 4 +- ..._pipelines_automatic_speech_recognition.py | 100 +++++------------- tests/pipelines/test_pipelines_common.py | 4 +- tests/utils/test_audio_utils.py | 4 +- 56 files changed, 110 insertions(+), 254 deletions(-) diff --git a/src/transformers/commands/pt_to_tf.py b/src/transformers/commands/pt_to_tf.py index 4df45f7f086a51..4002b5e0eb85a4 100644 --- a/src/transformers/commands/pt_to_tf.py +++ b/src/transformers/commands/pt_to_tf.py @@ -202,9 +202,7 @@ def get_inputs(self, pt_model, tf_dummy_inputs, config): """ def _get_audio_input(): - ds = load_dataset( - 
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") speech_samples = ds.sort("id").select(range(2))[:2]["audio"] raw_samples = [x["array"] for x in speech_samples] return raw_samples diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index c9a978f5ee8990..b226a059d106b1 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1760,7 +1760,7 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> # Whisper has `begin_suppress_tokens` set by default (= `[220, 50256]`). 50256 is the EOS token, so this means @@ -1812,7 +1812,7 @@ class SuppressTokensLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> # Whisper has a long list of suppressed tokens. For instance, in this case, the token 1 is suppressed by default. @@ -1901,7 +1901,7 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor): >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[3]["audio"]["array"], return_tensors="pt") >>> input_features = inputs.input_features diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py index 4124e380a3d73d..d53bed2a5d12fd 100644 --- a/src/transformers/models/clvp/modeling_clvp.py +++ b/src/transformers/models/clvp/modeling_clvp.py @@ -1681,7 +1681,7 @@ def get_speech_features( >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library) >>> text = "This is an example text." - >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() @@ -1754,7 +1754,7 @@ def forward( >>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library) >>> text = "This is an example text." 
- >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) >>> _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() diff --git a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py index 4db60e0faeb4c1..8f1a8370933c91 100755 --- a/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py @@ -831,7 +831,7 @@ def forward( >>> model.config.decoder_start_token_id = tokenizer.bos_token_id >>> # pre-process inputs and labels - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = feature_extractor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" ... ) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index fd0c271b668109..da79c2894877b4 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -1325,7 +1325,7 @@ def forward( ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values # Batch size 1 diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index 6c2a341927e200..2adfeea5b8b883 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -1471,7 +1471,7 @@ def call( ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 @@ -1583,7 +1583,7 @@ def call( ... 
return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index a46a1d62af1163..c2f5dd0259093b 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -464,7 +464,7 @@ def forward( >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values >>> # Inference: Translate English speech to German diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 9832987f4e6433..8353a172b2120f 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1129,7 +1129,7 @@ def forward( >>> model = Speech2TextModel.from_pretrained("facebook/s2t-small-librispeech-asr") >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/s2t-small-librispeech-asr") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = feature_extractor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" ... ) @@ -1270,7 +1270,7 @@ def forward( >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py index 6ad680d4fc0725..bac1256ca4b672 100755 --- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py @@ -1483,7 +1483,7 @@ def call( ... 
return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> ds.set_format(type="tf") diff --git a/src/transformers/models/univnet/modeling_univnet.py b/src/transformers/models/univnet/modeling_univnet.py index 887493fdcf55f3..5b0c659c302a7c 100644 --- a/src/transformers/models/univnet/modeling_univnet.py +++ b/src/transformers/models/univnet/modeling_univnet.py @@ -525,7 +525,7 @@ def forward( >>> model = UnivNetModel.from_pretrained("dg845/univnet-dev") >>> feature_extractor = UnivNetFeatureExtractor.from_pretrained("dg845/univnet-dev") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> # Resample the audio to the feature extractor's sampling rate. >>> ds = ds.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate)) >>> inputs = feature_extractor( diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py index 7a629e24572af3..9a24b9d39fdaac 100644 --- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py @@ -1076,7 +1076,7 @@ class FlaxWav2Vec2Model(FlaxWav2Vec2PreTrainedModel): ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor( @@ -1195,7 +1195,7 @@ class FlaxWav2Vec2ForCTC(FlaxWav2Vec2PreTrainedModel): ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor( @@ -1396,7 +1396,7 @@ def __call__( ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = feature_extractor(ds["speech"][0], return_tensors="np").input_values # Batch size 1 diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index cc8478d5b3c06a..a8338e363d94a2 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -1542,7 +1542,7 @@ def call( ... return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 @@ -1654,7 +1654,7 @@ def call( ... 
return batch - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> ds = ds.map(map_to_array) >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1 diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 16e50cc06c52ce..f1d021b58ee538 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1938,7 +1938,7 @@ def forward( >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") >>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 >>> # compute masked indices diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index 6f631e4683ad3a..c37dd980d4ed3b 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -1453,7 +1453,7 @@ def forward( >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1 >>> # compute masked indices diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 0467362ea2c7ec..4a28eb9203852c 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -464,7 +464,7 @@ def generate( >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> input_features = inputs.input_features diff --git a/src/transformers/models/whisper/modeling_flax_whisper.py b/src/transformers/models/whisper/modeling_flax_whisper.py index 9da592c107daef..cc4483963c6309 100644 --- a/src/transformers/models/whisper/modeling_flax_whisper.py +++ b/src/transformers/models/whisper/modeling_flax_whisper.py @@ -985,7 +985,7 @@ def encode( >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True) - >>> ds = 
load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np") >>> input_features = inputs.input_features >>> encoder_outputs = model.encode(input_features=input_features) @@ -1045,7 +1045,7 @@ def decode( >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True) - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> input_features = processor(ds[0]["audio"]["array"], return_tensors="np").input_features >>> encoder_outputs = model.encode(input_features=input_features) @@ -1297,7 +1297,7 @@ def decode( >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True) - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np") >>> input_features = inputs.input_features >>> encoder_outputs = model.encode(input_features=input_features) @@ -1516,7 +1516,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): >>> processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = FlaxWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en", from_pt=True) - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="np") >>> input_features = inputs.input_features >>> generated_ids = model.generate(input_ids=input_features) diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py index 6f50141bff9f40..18f55dce8a2224 100644 --- a/src/transformers/models/whisper/modeling_tf_whisper.py +++ b/src/transformers/models/whisper/modeling_tf_whisper.py @@ -1147,7 +1147,7 @@ def call( >>> model = TFWhisperModel.from_pretrained("openai/whisper-base") >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="tf") >>> input_features = inputs.input_features >>> decoder_input_ids = tf.convert_to_tensor([[1, 1]]) * model.config.decoder_start_token_id @@ -1283,7 +1283,7 @@ def call( >>> model = TFWhisperModel.from_pretrained("openai/whisper-base") >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", 
split="validation") >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="tf") >>> input_features = inputs.input_features >>> decoder_input_ids = tf.convert_to_tensor([[1, 1]]) * model.config.decoder_start_token_id @@ -1413,7 +1413,7 @@ def call( >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="tf") >>> input_features = inputs.input_features diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 7ba2af00ad81c0..6db7da4b957cdc 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -1555,7 +1555,7 @@ def forward( >>> model = WhisperModel.from_pretrained("openai/whisper-base") >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt") >>> input_features = inputs.input_features >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id @@ -1698,7 +1698,7 @@ def forward( >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") >>> input_features = inputs.input_features @@ -1959,7 +1959,7 @@ def forward( >>> assistant_model = WhisperForCausalLM.from_pretrained("distil-whisper/distil-large-v2") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> sample = ds[0]["audio"] >>> input_features = processor( ... 
sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt" diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index 967f1936215e97..fbe250908633db 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -153,9 +153,7 @@ def test_double_precision_pad(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/clap/test_feature_extraction_clap.py b/tests/models/clap/test_feature_extraction_clap.py index 8f2d6df3cb6c78..d0e913df828b84 100644 --- a/tests/models/clap/test_feature_extraction_clap.py +++ b/tests/models/clap/test_feature_extraction_clap.py @@ -164,9 +164,7 @@ def test_double_precision_pad(self): # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest._load_datasamples def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 8e3392133f1f26..9f8cc62d2e0fc3 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -665,9 +665,7 @@ def test_integration_unfused(self): "repeat": 0.0023, } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_sample = librispeech_dummy[-1] model_id = "laion/clap-htsat-unfused" @@ -694,9 +692,7 @@ def test_integration_fused(self): "pad": -0.000379, } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_sample = librispeech_dummy[-1] model_id = "laion/clap-htsat-fused" @@ -723,9 +719,7 @@ def test_batched_fused(self): "pad": 0.0006, } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]] model_id = "laion/clap-htsat-fused" @@ -752,9 +746,7 @@ def test_batched_unfused(self): "pad": 0.0019, } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = 
load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio_samples = [sample["array"] for sample in librispeech_dummy[0:4]["audio"]] model_id = "laion/clap-htsat-unfused" diff --git a/tests/models/clvp/test_feature_extraction_clvp.py b/tests/models/clvp/test_feature_extraction_clvp.py index 83be97e8675434..db641eaf6145cb 100644 --- a/tests/models/clvp/test_feature_extraction_clvp.py +++ b/tests/models/clvp/test_feature_extraction_clvp.py @@ -209,9 +209,7 @@ def test_double_precision_pad(self): self.assertTrue(pt_processed.input_features.dtype == torch.float32) def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", Audio(sampling_rate=22050)) # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 5d17d3fed622cd..0cf89a74523364 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -371,9 +371,7 @@ def get_config(self): def prepare_config_and_inputs(self): _, input_ids, attention_mask = self.clvp_encoder_tester.prepare_config_and_inputs() - ds = datasets.load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) _, audio, sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() @@ -555,9 +553,7 @@ def test_model_from_pretrained(self): class ClvpIntegrationTest(unittest.TestCase): def setUp(self): self.text = "This is an example text." 
- ds = datasets.load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) _, self.speech_samples, self.sr = ds.sort("id").select(range(1))[:1]["audio"][0].values() diff --git a/tests/models/data2vec/test_modeling_data2vec_audio.py b/tests/models/data2vec/test_modeling_data2vec_audio.py index 8bb16760ce61e9..d43128286853a5 100644 --- a/tests/models/data2vec/test_modeling_data2vec_audio.py +++ b/tests/models/data2vec/test_modeling_data2vec_audio.py @@ -694,9 +694,7 @@ def test_compute_mask_indices_short_audio(self): @slow class Data2VecAudioModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/encodec/test_feature_extraction_encodec.py b/tests/models/encodec/test_feature_extraction_encodec.py index 73c5019b11edda..e56517ac410661 100644 --- a/tests/models/encodec/test_feature_extraction_encodec.py +++ b/tests/models/encodec/test_feature_extraction_encodec.py @@ -138,9 +138,7 @@ def test_double_precision_pad(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech audio_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/encodec/test_modeling_encodec.py b/tests/models/encodec/test_modeling_encodec.py index 0a023894d8a00a..cff297be8e0002 100644 --- a/tests/models/encodec/test_modeling_encodec.py +++ b/tests/models/encodec/test_modeling_encodec.py @@ -461,9 +461,7 @@ def test_integration_24kHz(self): "1.5": [371955], "24.0": [6659962], } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") model_id = "facebook/encodec_24khz" model = EncodecModel.from_pretrained(model_id).to(torch_device) @@ -517,9 +515,7 @@ def test_integration_48kHz(self): "3.0": [144259, 146765, 156435, 176871, 161971], "24.0": [1568553, 1294948, 1306190, 1464747, 1663150], } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") model_id = "facebook/encodec_48khz" model = EncodecModel.from_pretrained(model_id).to(torch_device) @@ -581,9 +577,7 @@ def test_batch_48kHz(self): [85561, 81870, 76953, 48967, 79315, 85442, 81479, 107241], ], } - librispeech_dummy = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") 
model_id = "facebook/encodec_48khz" model = EncodecModel.from_pretrained(model_id).to(torch_device) diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index cd801be41d7b3d..86f2b4119324ae 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -753,9 +753,7 @@ class HubertModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/hubert/test_modeling_tf_hubert.py b/tests/models/hubert/test_modeling_tf_hubert.py index 35a8d98c233f77..3685e6598740c5 100644 --- a/tests/models/hubert/test_modeling_tf_hubert.py +++ b/tests/models/hubert/test_modeling_tf_hubert.py @@ -609,9 +609,7 @@ class TFHubertModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/pop2piano/test_feature_extraction_pop2piano.py b/tests/models/pop2piano/test_feature_extraction_pop2piano.py index 5a4652ad577cd7..c6766147975962 100644 --- a/tests/models/pop2piano/test_feature_extraction_pop2piano.py +++ b/tests/models/pop2piano/test_feature_extraction_pop2piano.py @@ -136,9 +136,7 @@ def test_call(self): self.assertTrue(input_features.extrapolated_beatstep.ndim == 2) def test_integration(self): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") speech_samples = ds.sort("id").select([0])["audio"] input_speech = [x["array"] for x in speech_samples][0] sampling_rate = [x["sampling_rate"] for x in speech_samples][0] diff --git a/tests/models/pop2piano/test_processor_pop2piano.py b/tests/models/pop2piano/test_processor_pop2piano.py index 634cdd26bd105c..06a8bacfd8a45c 100644 --- a/tests/models/pop2piano/test_processor_pop2piano.py +++ b/tests/models/pop2piano/test_processor_pop2piano.py @@ -111,9 +111,7 @@ def test_save_load_pretrained_additional_features(self): def get_inputs(self): """get inputs for both feature extractor and tokenizer""" - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") speech_samples = ds.sort("id").select([0])["audio"] input_speech = [x["array"] for x in speech_samples][0] sampling_rate = [x["sampling_rate"] for x in speech_samples][0] diff --git a/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py b/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py index d9919e0adea6cc..a8fca4b90ba941 100644 --- 
a/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_feature_extraction_seamless_m4t.py @@ -258,9 +258,7 @@ def test_double_precision_pad(self): self.assertTrue(pt_processed.input_features.dtype == torch.float32) def _load_datasample(self, id): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_sample = ds.sort("id")[id]["audio"]["array"] diff --git a/tests/models/sew/test_modeling_sew.py b/tests/models/sew/test_modeling_sew.py index fe10d994450be7..6b21c2e9f7128e 100644 --- a/tests/models/sew/test_modeling_sew.py +++ b/tests/models/sew/test_modeling_sew.py @@ -494,9 +494,7 @@ class SEWModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/sew_d/test_modeling_sew_d.py b/tests/models/sew_d/test_modeling_sew_d.py index 9fd94fbfef2668..b2efdccdf07c3f 100644 --- a/tests/models/sew_d/test_modeling_sew_d.py +++ b/tests/models/sew_d/test_modeling_sew_d.py @@ -508,9 +508,7 @@ class SEWDModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index 6c8861e3d8689f..9023e8467f736c 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -259,9 +259,7 @@ def test_double_precision_pad(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index 44672f1c588f31..4f19cc01b33c70 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -793,9 +793,7 @@ def default_processor(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", 
split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py index d12174533395df..c2fd215f388575 100644 --- a/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_tf_speech_to_text.py @@ -587,9 +587,7 @@ def default_processor(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/speecht5/test_feature_extraction_speecht5.py b/tests/models/speecht5/test_feature_extraction_speecht5.py index f8f7f53cac200c..5ec632e7e76c63 100644 --- a/tests/models/speecht5/test_feature_extraction_speecht5.py +++ b/tests/models/speecht5/test_feature_extraction_speecht5.py @@ -380,9 +380,7 @@ def test_attention_mask_with_truncation_target(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index 1d67bb4f8ab527..7a8aab83272bc8 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -744,9 +744,7 @@ def default_processor(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] @@ -1771,9 +1769,7 @@ def default_processor(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/unispeech/test_modeling_unispeech.py b/tests/models/unispeech/test_modeling_unispeech.py index 1804e2c95ef4ce..d0a1d352243b19 100644 --- a/tests/models/unispeech/test_modeling_unispeech.py +++ b/tests/models/unispeech/test_modeling_unispeech.py @@ -549,9 +549,7 @@ def test_model_from_pretrained(self): @slow class UniSpeechModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") 
# automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py index f3d467f0795d11..1aa2da20d5ec85 100644 --- a/tests/models/unispeech_sat/test_modeling_unispeech_sat.py +++ b/tests/models/unispeech_sat/test_modeling_unispeech_sat.py @@ -806,9 +806,7 @@ def test_model_from_pretrained(self): @slow class UniSpeechSatModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/univnet/test_feature_extraction_univnet.py b/tests/models/univnet/test_feature_extraction_univnet.py index 673faaae9adaca..dfa335d15383ee 100644 --- a/tests/models/univnet/test_feature_extraction_univnet.py +++ b/tests/models/univnet/test_feature_extraction_univnet.py @@ -327,9 +327,7 @@ def test_double_precision_pad(self): self.assertTrue(pt_processed.input_features.dtype == torch.float32) def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", Audio(sampling_rate=self.feat_extract_tester.sampling_rate)) # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/univnet/test_modeling_univnet.py b/tests/models/univnet/test_modeling_univnet.py index 4dc28b3c168b1d..e160c799b786cb 100644 --- a/tests/models/univnet/test_modeling_univnet.py +++ b/tests/models/univnet/test_modeling_univnet.py @@ -216,9 +216,7 @@ def tearDown(self): torch.cuda.empty_cache() def _load_datasamples(self, num_samples, sampling_rate=24000): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate)) # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py index 18252a17524337..b91d66654de6ae 100644 --- a/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_flax_wav2vec2.py @@ -489,9 +489,7 @@ def test_sample_negatives_with_attn_mask(self): @slow class FlaxWav2Vec2ModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py 
b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py index 2f10e3378d736a..7ef97290e61c9f 100644 --- a/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_tf_wav2vec2.py @@ -716,9 +716,7 @@ def tearDown(self): gc.collect() def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/wav2vec2/test_modeling_wav2vec2.py b/tests/models/wav2vec2/test_modeling_wav2vec2.py index 51d105a5ee3f80..ff7a85218d3a00 100644 --- a/tests/models/wav2vec2/test_modeling_wav2vec2.py +++ b/tests/models/wav2vec2/test_modeling_wav2vec2.py @@ -1464,9 +1464,7 @@ def tearDown(self): backend_empty_cache(torch_device) def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py index 0fbd000edc8c3d..80237fea9d1e43 100644 --- a/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py +++ b/tests/models/wav2vec2_bert/test_modeling_wav2vec2_bert.py @@ -855,9 +855,7 @@ def test_sample_negatives_with_mask(self): @slow class Wav2Vec2BertModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter(lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]) speech_samples = speech_samples[:num_samples]["audio"] diff --git a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py index ae13a8ecba9dea..096d1368ed02cb 100644 --- a/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py +++ b/tests/models/wav2vec2_conformer/test_modeling_wav2vec2_conformer.py @@ -863,9 +863,7 @@ def test_sample_negatives_with_mask(self): @slow class Wav2Vec2ConformerModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter(lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)]) speech_samples = speech_samples[:num_samples]["audio"] diff --git a/tests/models/wavlm/test_modeling_wavlm.py b/tests/models/wavlm/test_modeling_wavlm.py index 8f4d1e850e0056..b20792d83545d8 100644 --- a/tests/models/wavlm/test_modeling_wavlm.py +++ b/tests/models/wavlm/test_modeling_wavlm.py @@ -491,9 +491,7 @@ def 
test_model_from_pretrained(self): @slow class WavLMModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").filter( lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py index 579c42519ae033..a8295542f4e377 100644 --- a/tests/models/whisper/test_feature_extraction_whisper.py +++ b/tests/models/whisper/test_feature_extraction_whisper.py @@ -215,9 +215,7 @@ def test_double_precision_pad(self): self.assertTrue(pt_processed.input_features.dtype == torch.float32) def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/whisper/test_modeling_flax_whisper.py b/tests/models/whisper/test_modeling_flax_whisper.py index d5e18d22c2f3c8..4b8092e800ad69 100644 --- a/tests/models/whisper/test_modeling_flax_whisper.py +++ b/tests/models/whisper/test_modeling_flax_whisper.py @@ -410,9 +410,7 @@ def default_processor(self): return WhisperProcessor.from_pretrained("openai/whisper-base") def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/whisper/test_modeling_tf_whisper.py b/tests/models/whisper/test_modeling_tf_whisper.py index 97143cc4df5120..b200671e048f0e 100644 --- a/tests/models/whisper/test_modeling_tf_whisper.py +++ b/tests/models/whisper/test_modeling_tf_whisper.py @@ -704,7 +704,7 @@ def test_generate_with_prompt_ids_and_forced_decoder_ids(self): def _load_datasamples(num_samples): - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index a11097fe7dc391..5a59f7a72517a4 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1835,9 +1835,7 @@ def default_processor(self): return WhisperProcessor.from_pretrained("openai/whisper-base") def _load_datasamples(self, num_samples): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") # automatic decoding with librispeech speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] @@ -2718,9 +2716,7 @@ 
def test_speculative_decoding_distil(self): ) assistant_model.to(torch_device) - dataset = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = dataset[0]["audio"] input_features = processor(sample["array"], return_tensors="pt", sampling_rate=16_000).input_features @@ -2769,9 +2765,7 @@ def test_speculative_decoding_non_distil(self): ) assistant_model.to(torch_device) - dataset = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = dataset[0]["audio"] input_features = processor(sample["array"], return_tensors="pt", sampling_rate=16_000).input_features @@ -2812,7 +2806,7 @@ def test_whisper_longform_single_batch(self): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) input_features = processor( @@ -2848,9 +2842,7 @@ def test_whisper_longform_prompt_ids(self): prompt = "Mr. Kilter, Brionno." # let's force Quilter -> Kilter, Brion -> Brionno prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]") one_audio = np.concatenate([x["array"] for x in ds["audio"]], dtype=np.float32) first_text = ds[0]["text"].lower() @@ -2901,7 +2893,7 @@ def test_whisper_longform_single_batch_prev_cond(self): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) input_features = processor( @@ -2983,7 +2975,7 @@ def test_whisper_longform_single_batch_beam(self): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) input_features = processor( @@ -3025,7 +3017,7 @@ def test_whisper_longform_multi_batch(self): model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) audios = [] audios.append(one_audio[110000:]) @@ -3079,7 +3071,7 @@ def test_whisper_longform_multi_batch_prev_cond(self): model = 
WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") model = model.to(torch_device) - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean") one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) audios = [] audios.append(one_audio[110000:]) diff --git a/tests/pipelines/test_pipelines_audio_classification.py b/tests/pipelines/test_pipelines_audio_classification.py index a8c5deb2284452..1f403a8be05d2a 100644 --- a/tests/pipelines/test_pipelines_audio_classification.py +++ b/tests/pipelines/test_pipelines_audio_classification.py @@ -71,9 +71,7 @@ def run_torchaudio(self, audio_classifier): import datasets # test with a local file - dataset = datasets.load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + dataset = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") audio = dataset[0]["audio"]["array"] output = audio_classifier(audio) self.assertEqual( diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 82c5580f0ea2cc..d8810f67eec15e 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -294,9 +294,7 @@ def test_torch_large(self): output = speech_recognizer(waveform) self.assertEqual(output, {"text": ""}) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) @@ -313,9 +311,7 @@ def test_torch_large_with_input_features(self): output = speech_recognizer(waveform) self.assertEqual(output, {"text": ""}) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "a man said to the universe sir i exist"}) @@ -545,9 +541,7 @@ def test_torch_whisper(self): model="openai/whisper-tiny", framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) @@ -722,9 +716,7 @@ def test_find_longest_common_subsequence(self): @slow @require_torch def test_whisper_timestamp_prediction(self): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") array = np.concatenate( [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] ) @@ -822,9 +814,7 @@ def test_whisper_timestamp_prediction(self): @slow 
@require_torch def test_whisper_large_timestamp_prediction(self): - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") array = np.concatenate( [ds[40]["audio"]["array"], ds[41]["audio"]["array"], ds[42]["audio"]["array"], ds[43]["audio"]["array"]] ) @@ -918,9 +908,7 @@ def test_whisper_word_timestamps_batched(self): chunk_length_s=3, return_timestamps="word", ) - data = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = data[0]["audio"] # not the same output as test_simple_whisper_asr because of chunking @@ -963,9 +951,7 @@ def test_whisper_large_word_timestamps_batched(self): model="openai/whisper-large-v3", return_timestamps="word", ) - data = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + data = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = data[0]["audio"] # not the same output as test_simple_whisper_asr because of chunking @@ -1010,9 +996,7 @@ def test_torch_speech_encoder_decoder(self): framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": 'Ein Mann sagte zum Universum : " Sir, ich existiert! 
"'}) @@ -1030,9 +1014,7 @@ def test_simple_wav2vec2(self): output = asr(waveform) self.assertEqual(output, {"text": ""}) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = asr(filename) self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) @@ -1058,9 +1040,7 @@ def test_simple_s2t(self): output = asr(waveform) self.assertEqual(output, {"text": "(Applausi)"}) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = asr(filename) self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."}) @@ -1080,9 +1060,7 @@ def test_simple_whisper_asr(self): model="openai/whisper-tiny.en", framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") filename = ds[0]["file"] output = speech_recognizer(filename) self.assertEqual( @@ -1151,9 +1129,7 @@ def test_simple_whisper_translation(self): model="openai/whisper-large", framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": " A man said to the universe, Sir, I exist."}) @@ -1188,9 +1164,7 @@ def test_whisper_language(self): model="openai/whisper-tiny.en", framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") filename = ds[0]["file"] # 1. 
English-only model compatible with no language argument @@ -1323,9 +1297,7 @@ def test_xls_r_to_en(self): framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."}) @@ -1341,9 +1313,7 @@ def test_xls_r_from_en(self): framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."}) @@ -1360,9 +1330,7 @@ def test_speech_to_text_leveraged(self): framework="pt", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") filename = ds[40]["file"] output = speech_recognizer(filename) @@ -1379,9 +1347,7 @@ def test_wav2vec2_conformer_float16(self): framework="pt", ) - dataset = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = dataset[0]["audio"] output = speech_recognizer(sample) @@ -1398,9 +1364,7 @@ def test_chunking_fast(self): chunk_length_s=10.0, ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 @@ -1416,9 +1380,7 @@ def test_return_timestamps_ctc_fast(self): model="hf-internal-testing/tiny-random-wav2vec2", ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") # Take short audio to keep the test readable audio = ds[40]["audio"]["array"][:800] @@ -1462,9 +1424,7 @@ def test_chunking_fast_with_lm(self): chunk_length_s=10.0, ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 @@ -1492,9 +1452,7 @@ def test_with_lm_fast(self): ) self.assertEqual(speech_recognizer.type, "ctc_with_lm") - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 @@ -1522,9 +1480,7 @@ def test_with_local_lm_fast(self): ) self.assertEqual(speech_recognizer.type, "ctc_with_lm") - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = 
load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 2 @@ -1608,9 +1564,7 @@ def test_seamless_v2(self): device=torch_device, ) - dataset = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") sample = dataset[0]["audio"] result = pipe(sample, generate_kwargs={"tgt_lang": "eng"}) @@ -1633,9 +1587,7 @@ def test_chunking_and_timestamps(self): chunk_length_s=10.0, ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 10 @@ -1747,9 +1699,7 @@ def test_chunking_with_lm(self): model="patrickvonplaten/wav2vec2-base-100h-with-lm", chunk_length_s=10.0, ) - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] n_repeats = 10 diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index d4dbff218558e1..95349a8335de6d 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -840,9 +840,7 @@ def test_cached_pipeline_has_minimum_calls_to_head(self): def test_chunk_pipeline_batching_single_file(self): # Make sure we have cached the pipeline. pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC") - ds = datasets.load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ).sort("id") + ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") audio = ds[40]["audio"]["array"] pipe = pipeline(model="hf-internal-testing/tiny-random-Wav2Vec2ForCTC") diff --git a/tests/utils/test_audio_utils.py b/tests/utils/test_audio_utils.py index 47c384870d4a04..3e417bf7e3b450 100644 --- a/tests/utils/test_audio_utils.py +++ b/tests/utils/test_audio_utils.py @@ -262,9 +262,7 @@ def test_window_function(self): def _load_datasamples(self, num_samples): from datasets import load_dataset - ds = load_dataset( - "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True - ) + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] return [x["array"] for x in speech_samples] From 2782aadae2b0b0c313eac3ee70f84f0335577635 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:55:16 +0800 Subject: [PATCH 10/73] [modelling] remove un-necessary transpose for fa2 attention (#31749) * [whisper] remove un-necessary transpose for fa2 attention * propagate --- src/transformers/models/idefics2/modeling_idefics2.py | 6 ++---- src/transformers/models/jamba/modeling_jamba.py | 3 +-- src/transformers/models/whisper/modeling_whisper.py | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py index 
4d978c053d3fa6..f57f1fc3d51a3b 100644 --- a/src/transformers/models/idefics2/modeling_idefics2.py +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -301,7 +301,7 @@ def forward( # Flash attention requires the input to have the shape # batch_size x seq_length x head_dim x hidden_dim # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) @@ -311,7 +311,6 @@ def forward( # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache # to be able to avoid many of these transpose/reshape/view. - query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) @@ -817,7 +816,7 @@ def forward( key_states = self.k_proj(torch.cat([context, latents], dim=-2)) value_states = self.v_proj(torch.cat([context, latents], dim=-2)) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) key_states = key_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -882,7 +881,6 @@ def forward( value_states = value_states.to(target_dtype) # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 768e8e01607588..28d732628a28da 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -406,7 +406,7 @@ def forward( # Flash attention requires the input to have the shape # batch_size x seq_length x head_dim x hidden_dim # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) @@ -469,7 +469,6 @@ def forward( value_states = value_states.to(target_dtype) # Reashape to the expected shape for Flash Attention - query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index 6db7da4b957cdc..c4d59a360464b0 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -387,7 +387,7 @@ def forward( bsz, tgt_len, _ = hidden_states.size() # get query proj - query_states = self._shape(self.q_proj(hidden_states), tgt_len, bsz) + query_states = torch.reshape(self.q_proj(hidden_states), (bsz, tgt_len, self.num_heads, self.head_dim)) if past_key_value 
is not None: is_updated = past_key_value.is_updated.get(self.layer_idx) @@ -416,7 +416,6 @@ def forward( # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim] # We would need to refactor the KV cache to be able to avoid many of these transpose/reshape/view. - query_states = query_states.transpose(1, 2) key_states = key_states.transpose(1, 2) value_states = value_states.transpose(1, 2) From 605f3245dcca34381c35520c35ba0b701ed80d58 Mon Sep 17 00:00:00 2001 From: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com> Date: Tue, 23 Jul 2024 10:11:12 +0200 Subject: [PATCH 11/73] Fix mask creations of `GPTNeoX` and `GPT2` (#31944) * fix mask creation of gpt2 and gpt_neox caused by me * forgot the reshape of masks when shape > 2 * add tests for gpt neox and gpt2 * nit on a comment --- src/transformers/models/gpt2/modeling_gpt2.py | 24 ++++++------- .../models/gpt_neox/modeling_gpt_neox.py | 36 +++++++++---------- tests/models/gpt2/test_modeling_gpt2.py | 34 ++++++++++++++++++ .../models/gpt_neox/test_modeling_gpt_neox.py | 34 ++++++++++++++++++ 4 files changed, 97 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 2b9d300aa9e1bc..7a51cb3eb2cdb8 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -1030,18 +1030,18 @@ def forward( # Attention mask. _use_sdpa = self._attn_implementation == "sdpa" and output_attentions is False and head_mask is None - if attention_mask is not None: - attention_mask = attention_mask.view(batch_size, -1) - if self._attn_implementation == "flash_attention_2": - attention_mask = attention_mask if 0 in attention_mask else None - elif _use_sdpa: - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask=attention_mask, - input_shape=(batch_size, input_shape[-1]), - inputs_embeds=inputs_embeds, - past_key_values_length=past_length, - ) - else: + attention_mask = attention_mask.view(batch_size, -1) if attention_mask is not None else None + if self._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif _use_sdpa: + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask=attention_mask, + input_shape=(batch_size, input_shape[-1]), + inputs_embeds=inputs_embeds, + past_key_values_length=past_length, + ) + else: + if attention_mask is not None: # We create a 3D attention mask from a 2D tensor mask. # Sizes are [batch_size, 1, 1, to_seq_length] # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index fd5e0b4fe62e25..32988e88df34a8 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -824,25 +824,23 @@ def forward( inputs_embeds = self.embed_in(input_ids) # Attention mask. 
- if attention_mask is not None: - assert batch_size > 0, "batch_size has to be defined and > 0" - attention_mask = attention_mask.view(batch_size, -1) - if self._attn_implementation == "flash_attention_2": - attention_mask = attention_mask if 0 in attention_mask else None - elif self._attn_implementation == "sdpa" and not output_attentions and head_mask is None: - attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask=attention_mask, - input_shape=(batch_size, seq_length), - inputs_embeds=inputs_embeds, - past_key_values_length=past_length, - ) - else: - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask=attention_mask, - input_shape=(batch_size, seq_length), - inputs_embeds=inputs_embeds, - past_key_values_length=past_length, - ) + attention_mask = attention_mask.view(batch_size, -1) if attention_mask is not None else None + if self._attn_implementation == "flash_attention_2": + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._attn_implementation == "sdpa" and not output_attentions and head_mask is None: + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask=attention_mask, + input_shape=(batch_size, seq_length), + inputs_embeds=inputs_embeds, + past_key_values_length=past_length, + ) + else: + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask=attention_mask, + input_shape=(batch_size, seq_length), + inputs_embeds=inputs_embeds, + past_key_values_length=past_length, + ) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py index 5755658288f568..3f96c20ab2dbd9 100644 --- a/tests/models/gpt2/test_modeling_gpt2.py +++ b/tests/models/gpt2/test_modeling_gpt2.py @@ -426,6 +426,36 @@ def create_and_check_gpt2_weight_initialization(self, config, *args): self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001) self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01) + def create_and_check_cached_forward_with_and_without_attention_mask(self, config, input_ids, *args): + # Relevant issue: https://github.com/huggingface/transformers/issues/31943 + model = GPT2Model(config) + model.to(torch_device) + model.eval() + + # We want this for SDPA, eager works with a `None` attention mask + assert ( + model.config._attn_implementation == "sdpa" + ), "This test assumes the model to have the SDPA implementation for its attention calculations." 
+ + # Prepare cache and non_cache input, needs a full attention mask + cached_len = input_ids.shape[-1] // 2 + input_mask = torch.ones(size=input_ids.size()).to(torch_device) + cache_inputs = {"input_ids": input_ids[:, :cached_len], "attention_mask": input_mask[:, :cached_len]} + non_cache_inputs = {"input_ids": input_ids[:, cached_len:], "attention_mask": input_mask} + + # Cached forward once with the attention mask provided and the other time without it (which should assume full attention) + cache_outputs = model(**cache_inputs) + full_outputs_with_attention_mask = model( + **non_cache_inputs, past_key_values=cache_outputs.past_key_values + ).last_hidden_state + full_outputs_without_attention_mask = model( + non_cache_inputs["input_ids"], past_key_values=cache_outputs.past_key_values + ).last_hidden_state + + self.parent.assertTrue( + torch.allclose(full_outputs_with_attention_mask, full_outputs_without_attention_mask, atol=1e-5) + ) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -570,6 +600,10 @@ def test_gpt2_weight_initialization(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_gpt2_weight_initialization(*config_and_inputs) + def test_cached_forward_with_and_without_attention_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_cached_forward_with_and_without_attention_mask(*config_and_inputs) + @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" ) diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py index 51a4d235c3bc5f..af162f50713e96 100644 --- a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -219,6 +219,36 @@ def create_and_check_decoder_model_past_large_inputs(self, config, input_ids, in # test that outputs are equal for slice self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + def create_and_check_cached_forward_with_and_without_attention_mask(self, config, input_ids, *args): + # Relevant issue: https://github.com/huggingface/transformers/issues/31943 + model = GPTNeoXModel(config) + model.to(torch_device) + model.eval() + + # We want this for SDPA, eager works with a `None` attention mask + assert ( + model.config._attn_implementation == "sdpa" + ), "This test assumes the model to have the SDPA implementation for its attention calculations." 
+ + # Prepare cache and non_cache input, needs a full attention mask + cached_len = input_ids.shape[-1] // 2 + input_mask = torch.ones(size=input_ids.size()).to(torch_device) + cache_inputs = {"input_ids": input_ids[:, :cached_len], "attention_mask": input_mask[:, :cached_len]} + non_cache_inputs = {"input_ids": input_ids[:, cached_len:], "attention_mask": input_mask} + + # Cached forward once with the attention mask provided and the other time without it (which should assume full attention) + cache_outputs = model(**cache_inputs) + full_outputs_with_attention_mask = model( + **non_cache_inputs, past_key_values=cache_outputs.past_key_values + ).last_hidden_state + full_outputs_without_attention_mask = model( + non_cache_inputs["input_ids"], past_key_values=cache_outputs.past_key_values + ).last_hidden_state + + self.parent.assertTrue( + torch.allclose(full_outputs_with_attention_mask, full_outputs_without_attention_mask, atol=1e-5) + ) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() config, input_ids, input_mask, token_labels = config_and_inputs @@ -300,6 +330,10 @@ def test_model_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + def test_cached_forward_with_and_without_attention_mask(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_cached_forward_with_and_without_attention_mask(*config_and_inputs) + @unittest.skip(reason="Feed forward chunking is not implemented") def test_feed_forward_chunking(self): pass From 7405c1c77e4637768ea0ad5d27d8a4d8d67bfb19 Mon Sep 17 00:00:00 2001 From: KonradSzafer <61851539+KonradSzafer@users.noreply.github.com> Date: Tue, 23 Jul 2024 10:56:21 +0200 Subject: [PATCH 12/73] Add method to retrieve used chat template (#32032) encapsulate chat template logic --- src/transformers/tokenization_utils_base.py | 120 ++++++++++++-------- 1 file changed, 72 insertions(+), 48 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 58052579f2be08..7ffd3bbcaa6be7 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1772,54 +1772,7 @@ def apply_chat_template( if tokenizer_kwargs is None: tokenizer_kwargs = {} - using_default_template = False - - # First, handle the cases when the model has a dict of multiple templates - if isinstance(self.chat_template, dict) or ( - self.chat_template is None and isinstance(self.default_chat_template, dict) - ): - if self.chat_template is not None: - template_dict = self.chat_template - using_default_dict = False - else: - template_dict = self.default_chat_template - using_default_dict = True - if chat_template is not None and chat_template in template_dict: - # The user can pass the name of a template to the chat template argument instead of an entire template - chat_template = template_dict[chat_template] - if using_default_dict: - using_default_template = True - elif chat_template is None: - if tools is not None and "tool_use" in template_dict: - chat_template = template_dict["tool_use"] - elif "default" in template_dict: - chat_template = template_dict["default"] - else: - raise ValueError( - "This model has multiple chat templates with no default specified! 
Please either pass a chat " - "template or the name of the template you wish to use to the `chat_template` argument. Available " - f"template names are {sorted(template_dict.keys())}." - ) - if using_default_dict: - using_default_template = True - - elif chat_template is None: - # These are the cases when the model has a single template - # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template - if self.chat_template is not None: - chat_template = self.chat_template - else: - chat_template = self.default_chat_template - using_default_template = True - - if using_default_template: - logger.warning_once( - "No chat template is set for this tokenizer, falling back to a default class-level template. This is " - "very error-prone, because models are often trained with templates different from the class default! " - "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which " - "point any code depending on them will stop working. We recommend setting a valid chat template before " - "then to ensure that this model continues working without issues." - ) + chat_template = self.get_chat_template(chat_template, tools) if return_assistant_tokens_mask and not re.search(r"\{\%-?\s*generation\s*-?\%\}", chat_template): logger.warning_once( @@ -2012,6 +1965,77 @@ def activate_tracker(self, rendered_blocks: list[int], generation_indices: list[ jinja_env.globals["raise_exception"] = raise_exception return jinja_env.from_string(chat_template) + def get_chat_template(self, chat_template: Optional[str] = None, tools: Optional[List[Dict]] = None) -> str: + """ + Retrieve the chat template string used for tokenizing chat messages. This template is used + internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat + template for better generation tracking. + + Args: + chat_template (`str`, *optional*): + A Jinja template or the name of a template to use for this conversion. + It is usually not necessary to pass anything to this argument, + as the model's template will be used by default. + tools (`List[Dict]`, *optional*): + A list of tools (callable functions) that will be accessible to the model. If the template does not + support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema, + giving the name, description and argument types for the tool. See our + [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use) + for more information. + + Returns: + `str`: The chat template string. 
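+
+        Example (a minimal sketch; the checkpoint name is illustrative and assumed to ship a chat template):
+
+        ```python
+        >>> from transformers import AutoTokenizer
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+        >>> chat_template = tokenizer.get_chat_template()
+        >>> # `apply_chat_template` renders this same Jinja template string under the hood.
+        ```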
+ """ + using_default_template = False + # First, handle the cases when the model has a dict of multiple templates + if isinstance(self.chat_template, dict) or ( + self.chat_template is None and isinstance(self.default_chat_template, dict) + ): + if self.chat_template is not None: + template_dict = self.chat_template + using_default_dict = False + else: + template_dict = self.default_chat_template + using_default_dict = True + if chat_template is not None and chat_template in template_dict: + # The user can pass the name of a template to the chat template argument instead of an entire template + chat_template = template_dict[chat_template] + if using_default_dict: + using_default_template = True + elif chat_template is None: + if tools is not None and "tool_use" in template_dict: + chat_template = template_dict["tool_use"] + elif "default" in template_dict: + chat_template = template_dict["default"] + else: + raise ValueError( + "This model has multiple chat templates with no default specified! Please either pass a chat " + "template or the name of the template you wish to use to the `chat_template` argument. Available " + f"template names are {sorted(template_dict.keys())}." + ) + if using_default_dict: + using_default_template = True + + elif chat_template is None: + # These are the cases when the model has a single template + # priority: `chat_template` argument > `tokenizer.chat_template` > `tokenizer.default_chat_template + if self.chat_template is not None: + chat_template = self.chat_template + else: + chat_template = self.default_chat_template + using_default_template = True + + if using_default_template: + logger.warning_once( + "No chat template is set for this tokenizer, falling back to a default class-level template. This is " + "very error-prone, because models are often trained with templates different from the class default! " + "Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which " + "point any code depending on them will stop working. We recommend setting a valid chat template before " + "then to ensure that this model continues working without issues." + ) + + return chat_template + @property def default_chat_template(self): """ From 34b43211d782c00da6fef778dbfaff69bbf3f115 Mon Sep 17 00:00:00 2001 From: mig-mfreitas <132093787+mig-mfreitas@users.noreply.github.com> Date: Tue, 23 Jul 2024 10:07:58 +0100 Subject: [PATCH 13/73] Add YaRN and Dynamic-YaRN RoPE Scaling Methods (#30910) * Add YaRN and Dynamic-YaRN RoPE Scaling Methods YaRN (Yet another RoPE extension method) combines the NTK-By-Parts Interpolation and Attention Scaling methods, improving upon existing RoPE interpolation methods for longer context window sizes. Fine-tuned models maintain their original performance across benchmarks while enabling efficient extrapolation and transfer learning for quicker convergence, especially in compute-limited environments. We implement YaRN and Dynamic-YaRN for the following list of models: - LLaMA - Falcon - GPT-NeoX - Olmo - Persimmon - Phi - StableLM - OpenLLaMA New unit tests are added to assert YaRN's correct behavior on both short and long sequence inputs. For more details, please refer to https://arxiv.org/abs/2309.00071. Co-authored-by: Miguel Almeida * Refactor YaRN implementation for LLaMA Iterate on YaRN implementation for LLaMA and remove diff from remaining models for increased PR modularity. 
This commit includes the following changes: - Merge 'yarn_rope_scaling' and 'rope_scaling' dictionaries - Remove unnecessary attributes ('extrapolation_factor' and 'finetuned') from YaRN classes - Inherit 'forward' method in YaRN classes from superclass - Rename 'yarn' method to 'compute_yarn_scaling' - Extend YaRN tests with further assertions - Fix style inconsistencies Co-authored-by: Miguel Monte e Freitas * Refactor Tensor Building Logic for YaRN - Comply with the the tensor building logic introduced in #30743 - Add referencing to the optimized Attention Factor equation - Remove Dynamic YaRN for a more agile deployment Co-authored-by: mig-mfreitas * remove unwanted file --------- Co-authored-by: Miguel Almeida Co-authored-by: mig-mfreitas Co-authored-by: Joao Gante --- .../models/falcon/modeling_falcon.py | 1 - .../models/fuyu/configuration_fuyu.py | 1 - .../models/gpt_neox/configuration_gpt_neox.py | 1 - .../models/llama/configuration_llama.py | 58 ++++++++++-- .../models/llama/modeling_llama.py | 88 +++++++++++++++++++ .../models/olmo/configuration_olmo.py | 1 - src/transformers/models/olmo/modeling_olmo.py | 1 - .../persimmon/configuration_persimmon.py | 1 - .../models/phi/configuration_phi.py | 1 - .../models/stablelm/configuration_stablelm.py | 1 - tests/models/llama/test_modeling_llama.py | 23 ++++- 11 files changed, 162 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index d1050d542a2f38..663582c8a72a83 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -283,7 +283,6 @@ def __init__(self, config: FalconConfig): self.attention_dropout = nn.Dropout(config.attention_dropout) self.num_kv_heads = config.num_kv_heads if (self.new_decoder_architecture or not self.multi_query) else 1 - # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Falcon def _init_rope(self): if self.config.rope_scaling is None: self.rotary_emb = FalconRotaryEmbedding( diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index ffcdd2b61750a6..03d2aecc02b6c9 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -188,7 +188,6 @@ def __init__( **kwargs, ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 8e4c94692e0537..944dbb5e02f098 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -154,7 +154,6 @@ def __init__( "The hidden size is not divisble by the number of attention heads! Make sure to update them!" ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. 
diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 1a059101e42492..843731eeffc8ee 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -84,13 +84,22 @@ class LlamaConfig(PretrainedConfig): rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling + strategies: linear, dynamic and yarn. Their scaling factor must be a float greater than 1. The expected format is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update `max_position_embeddings` to the expected new maximum. See the following thread for more information on how these scaling strategies behave: https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an experimental feature, subject to breaking API changes in future versions. + For the `yarn` strategy, the dictionary may also contain the following fields: + `original_max_position_embeddings` (`int`, *optional*): + The original maximum sequence length. This is used to scale the RoPE embeddings. + `attention_factor` (`float`, *optional*): + The attention scaling factor. If unspecified, it defaults to `0.1 ln(s) + 1`, where `s` is the `original_max_position_embeddings/max_position_embeddings` ratio. + `beta_fast` (`float`, *optional*): + Parameter to set the boundary for extrapolation (only) in the linear ramp function. + `beta_slow` (`float`, *optional*): + Parameter to set the boundary for interpolation (only) in the linear ramp function. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -178,15 +187,52 @@ def _rope_scaling_validation(self): if self.rope_scaling is None: return - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) < 2: raise ValueError( - "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}" + "`rope_scaling` must be a dictionary with a minimum of two fields, `type` and `factor`, " + f"got {self.rope_scaling}" ) rope_scaling_type = self.rope_scaling.get("type", None) rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "yarn"]: raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic', 'yarn'], got {rope_scaling_type}" ) if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + + if rope_scaling_type != "yarn": + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) > 6: + raise ValueError( + "`rope_scaling` with type " + f"{rope_scaling_type}" + " must be a dictionary with a maximum of six fields, `type`, `factor`," + "`original_max_position_embeddings`, `attention_factor`, `beta_fast`, `beta_slow`, " + f"got {self.rope_scaling}" + ) + original_max_position_embeddings = self.rope_scaling.get("original_max_position_embeddings", None) + attention_factor = self.rope_scaling.get("attention_factor", None) + beta_fast = self.rope_scaling.get("beta_fast", None) + beta_slow = self.rope_scaling.get("beta_slow", None) + + if original_max_position_embeddings is not None and not isinstance(original_max_position_embeddings, int): + raise ValueError( + f"`rope_scaling`'s original_max_position_embeddings field must be an int, got {original_max_position_embeddings}" + ) + if attention_factor is not None and not isinstance(attention_factor, float) or attention_factor < 0: + raise ValueError( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + if beta_fast is not None and not isinstance(beta_fast, float): + raise ValueError(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") + if beta_slow is not None and not isinstance(beta_slow, float): + raise ValueError(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") + + b_fast = beta_fast if beta_fast is not None else 32 + b_slow = beta_slow if beta_slow is not None else 1 + if b_fast < b_slow: + raise ValueError( + f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={b_fast} and beta_slow={b_slow}" + ) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 5c0c57f3effe86..b624a2d92d0970 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -132,6 +132,77 @@ def forward(self, x, position_ids): return cos, sin +class LlamaYarnScalingRotaryEmbedding(LlamaRotaryEmbedding): + def __init__( + self, + dim, + max_position_embeddings=2048, + base=10000, + scaling_factor=1, + original_max_position_embeddings=2048, + attention_factor=None, 
+ beta_fast=32, + beta_slow=1, + device=None, + ): + super().__init__(dim, max_position_embeddings, base, device, scaling_factor) + + self.original_max_position_embeddings = original_max_position_embeddings + self.attention_factor = attention_factor + self.beta_fast = beta_fast + self.beta_slow = beta_slow + + if self.attention_factor is None: + # Recommended attention factor for LLaMA models. + # For more details please refer to https://arxiv.org/pdf/2309.00071, Eq. 22. + self.attention_factor = 0.1 * math.log(scaling_factor) + 1.0 + + self.compute_yarn_scaling(device) + + # Inverse dimension formula to find the dimension based on the number of rotations + def find_correction_dim(self, num_rotations, dim, base=10000, max_position_embeddings=2048): + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) + + # Find dimension range bounds based on rotations + def find_correction_range(self, low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): + low = math.floor(self.find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(self.find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) + + def linear_ramp_mask(self, min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + def forward(self, x, position_ids=None): + # Difference to the original RoPE: applies a scaling factor computed with + # the YaRN method (NTK-by-Parts + Attn Scaling) + # x: [bs, num_attention_heads, seq_len, head_size] + cos, sin = super().forward(x, position_ids) + cos = cos * self.mscale + sin = sin * self.mscale + return cos, sin + + def compute_yarn_scaling(self, device): + pos_freqs = self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (self.scaling_factor * pos_freqs) + + low, high = self.find_correction_range( + self.beta_fast, self.beta_slow, self.dim, self.base, self.original_max_position_embeddings + ) + # Get n-dimensional rotational scaling corrected for extrapolation + inv_freq_mask = 1 - self.linear_ramp_mask(low, high, self.dim // 2).float().to(device) + inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + + self.register_buffer("inv_freq", inv_freq) + # Get n-dimensional magnitude scaling corrected for interpolation + self.mscale = self.attention_factor + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -258,6 +329,15 @@ def _init_rope(self): else: scaling_type = self.config.rope_scaling["type"] scaling_factor = self.config.rope_scaling["factor"] + # Yarn parameters + kwargs = { + "dim": self.config.rope_scaling.get("original_max_position_embeddings", None), + "max_position_embeddings": self.config.rope_scaling.get("attention_factor", None), + "base": self.config.rope_scaling.get("beta_fast", None), + "scaling_factor": self.config.rope_scaling.get("beta_slow", None), + } + kwargs = {k: v for k, v in kwargs.items() if v is not None} + if scaling_type == "linear": self.rotary_emb = LlamaLinearScalingRotaryEmbedding( self.head_dim, @@ -272,6 +352,14 @@ def _init_rope(self): scaling_factor=scaling_factor, base=self.rope_theta, ) + elif scaling_type == "yarn": + self.rotary_emb = LlamaYarnScalingRotaryEmbedding( + 
self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + **kwargs, + ) else: raise ValueError(f"Unknown RoPE scaling type {scaling_type}") diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index a25ccd8cc09def..77a3b18e364ecf 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -160,7 +160,6 @@ def __init__( **kwargs, ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 4fd0c92686834b..59c9b3bf1b66a4 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -236,7 +236,6 @@ def __init__(self, config: OlmoConfig, layer_idx: Optional[int] = None): self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) self._init_rope() - # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Olmo def _init_rope(self): if self.config.rope_scaling is None: self.rotary_emb = OlmoRotaryEmbedding( diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index b8e02256de808a..11f4c66d73e6b3 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -138,7 +138,6 @@ def __init__( **kwargs, ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index d1e3464ee48271..e54d400ae6e72e 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -165,7 +165,6 @@ def __init__( **kwargs, ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index abea7483a67de6..c05ac9f036d62b 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -164,7 +164,6 @@ def __init__( **kwargs, ) - # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. 
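As a usage sketch of the `yarn` strategy added above (all values are illustrative and only exercise
the new configuration validation, not a trained checkpoint):

    from transformers import LlamaConfig

    rope_scaling = {
        "type": "yarn",
        "factor": 2.0,                              # float > 1, extends the usable context window
        "original_max_position_embeddings": 4096,   # original maximum sequence length
        "attention_factor": 1.0,                    # optional attention scaling factor
        "beta_fast": 32.0,                          # extrapolation boundary of the linear ramp
        "beta_slow": 1.0,                           # interpolation boundary of the linear ramp
    }
    config = LlamaConfig(rope_scaling=rope_scaling)  # triggers the yarn-specific validation above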
diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index e0311b7cea4a0e..de7eb7e44156c1 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -55,6 +55,7 @@ LlamaDynamicNTKScalingRotaryEmbedding, LlamaLinearScalingRotaryEmbedding, LlamaRotaryEmbedding, + LlamaYarnScalingRotaryEmbedding, ) @@ -397,7 +398,7 @@ def test_llama_token_classification_model(self): def test_save_load_fast_init_from_base(self): pass - @parameterized.expand([("linear",), ("dynamic",)]) + @parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) def test_model_rope_scaling_from_config(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() short_input = ids_tensor([1, 10], config.vocab_size) @@ -491,6 +492,26 @@ def test_model_rope_scaling(self): torch.testing.assert_close(ntk_sin_long, original_sin_long) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) + # Sanity check Yarn RoPE scaling + yarn_scaling_rope = LlamaYarnScalingRotaryEmbedding( + head_dim, + max_position_embeddings=config.max_position_embeddings, + base=config.rope_theta, + scaling_factor=scaling_factor, + ).to(torch_device) + yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) + yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) + torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_short, original_cos_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_long, original_sin_long) + @require_flash_attn @require_torch_gpu @require_bitsandbytes From 1535a2c93d325e529dc9a1907f99247fdf8a58e7 Mon Sep 17 00:00:00 2001 From: Daniel Lok Date: Tue, 23 Jul 2024 17:26:00 +0800 Subject: [PATCH 14/73] Disable quick init for TapasPreTrainedModel (#32149) add attribute to model Signed-off-by: Daniel Lok --- src/transformers/models/tapas/modeling_tapas.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index a06770778e717e..17a01da150b5bd 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -724,6 +724,7 @@ class TapasPreTrainedModel(PreTrainedModel): config_class = TapasConfig base_model_prefix = "tapas" supports_gradient_checkpointing = True + _supports_param_buffer_assignment = False # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): From 5a4a76edb7ac6bbc764392e89adc11adda91f3e5 Mon Sep 17 00:00:00 2001 From: bayllama <142558246+bayllama@users.noreply.github.com> Date: Tue, 23 Jul 2024 02:28:44 -0700 Subject: [PATCH 15/73] Modify resize_token_embeddings to ensure output type is same as input (#31979) * Change resize_token_embeddings to make it return same Class that is passed to it * Add explanatory comment as requested in review * Add explanatory comments for add resizing function in lxmert * Add comment for padding_idx and moving _resize_bias in lxmert to LxmertForPreTraining --------- 
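A short usage sketch of the behaviour described above (the checkpoint name is illustrative):

    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("gpt2")
    embed_type_before = type(model.get_input_embeddings())
    model.resize_token_embeddings(model.config.vocab_size + 8)
    # The resized input embeddings keep their original class, which matters for custom embedding subclasses.
    assert type(model.get_input_embeddings()) is embed_type_before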
Co-authored-by: Prashanth Sateesh Co-authored-by: Prashanth Sateesh --- src/transformers/modeling_utils.py | 13 ++++++++++++- .../models/lxmert/modeling_lxmert.py | 16 ++++++++++++++++ tests/test_modeling_common.py | 5 +++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index a20b7d941fbfe6..81403f524f9e79 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2128,7 +2128,18 @@ def _get_resized_embeddings( else: new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :] - return new_embeddings + # Replace weights in old_embeddings and return to maintain the same embedding type. + # This ensures correct functionality when a Custom Embedding class is passed as input. + # The input and output embedding types remain consistent. (c.f. https://github.com/huggingface/transformers/pull/31979) + old_embeddings.weight.data = new_embeddings.weight.data + old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] + + # If the new number of tokens is smaller than the original `padding_idx`, the `padding_idx` + # will be set to `None` in the resized embeddings. + if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: + old_embeddings.padding_idx = None + + return old_embeddings def _get_resized_lm_head( self, old_lm_head: nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index b77b87318386e3..9113fc4fd0eb9d 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -1072,6 +1072,22 @@ def __init__(self, config): } self.visual_losses = visual_losses + def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding: + # Adding the following steps to resize bias to match the shape of resized embeddings + new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + self.cls.predictions.bias = self._resize_bias(self.cls.predictions.bias, new_num_tokens) + return new_embeddings + + def _resize_bias(self, bias, new_num_tokens: int): + old_num_tokens = bias.shape[0] + if new_num_tokens <= old_num_tokens: + new_bias = bias[:new_num_tokens] + else: + extra_bias = torch.zeros(new_num_tokens - old_num_tokens, device=bias.device) + new_bias = torch.cat([bias, extra_bias]) + new_bias = nn.Parameter(new_bias) + return new_bias + def resize_num_qa_labels(self, num_labels): """ Build a resized question answering linear layer Module from a provided new linear layer. 
Increasing the size diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index dd041188cdca3d..19a945aec52799 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1755,6 +1755,8 @@ def test_resize_tokens_embeddings(self): config = copy.deepcopy(original_config) model = model_class(config) model.to(torch_device) + model_embed_pre_resize = model.get_input_embeddings() + type_model_embed_pre_resize = type(model_embed_pre_resize) if self.model_tester.is_training is False: model.eval() @@ -1774,6 +1776,9 @@ def test_resize_tokens_embeddings(self): self.assertEqual(new_model_vocab_size, model_vocab_size + 10) # Check that it actually resizes the embeddings matrix self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check to make sure the type of embeddings returned post resizing is same as type of input + type_model_embed_post_resize = type(model_embed) + self.assertEqual(type_model_embed_pre_resize, type_model_embed_post_resize) # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) From 2e113422b3504fe6de821bb9911b24273b11aa9c Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 23 Jul 2024 10:42:55 +0100 Subject: [PATCH 16/73] Llama: RoPE refactor (#32135) Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- src/transformers/__init__.py | 2 + src/transformers/modeling_rope_utils.py | 451 ++++++++++++++++++ .../models/chameleon/modeling_chameleon.py | 18 +- .../models/cohere/modeling_cohere.py | 9 +- .../models/jamba/modeling_jamba.py | 2 +- .../models/jetmoe/modeling_jetmoe.py | 2 +- .../models/llama/configuration_llama.py | 105 ++-- .../models/llama/modeling_llama.py | 284 ++++++----- .../models/mistral/modeling_mistral.py | 11 +- .../models/mixtral/modeling_mixtral.py | 2 +- src/transformers/models/olmo/modeling_olmo.py | 18 +- .../models/persimmon/modeling_persimmon.py | 2 +- src/transformers/models/phi/modeling_phi.py | 2 +- src/transformers/models/phi3/modeling_phi3.py | 2 +- .../models/qwen2_moe/modeling_qwen2_moe.py | 2 +- .../models/stablelm/modeling_stablelm.py | 2 +- .../models/starcoder2/modeling_starcoder2.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 3 + tests/models/llama/test_modeling_llama.py | 78 +-- tests/utils/test_modeling_rope_utils.py | 120 +++++ 20 files changed, 831 insertions(+), 286 deletions(-) create mode 100644 src/transformers/modeling_rope_utils.py create mode 100644 tests/utils/test_modeling_rope_utils.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index fe31cd3c237b1f..bc6e786358b68d 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1295,6 +1295,7 @@ ) _import_structure["modeling_flash_attention_utils"] = [] _import_structure["modeling_outputs"] = [] + _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS"] _import_structure["modeling_utils"] = ["PreTrainedModel"] # PyTorch models structure @@ -6010,6 +6011,7 @@ WatermarkLogitsProcessor, WhisperTimeStampLogitsProcessor, ) + from .modeling_rope_utils import ROPE_INIT_FUNCTIONS from .modeling_utils import PreTrainedModel from .models.albert import ( AlbertForMaskedLM, diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py new file mode 100644 index 00000000000000..33055d2bf942e9 --- /dev/null +++ 
b/src/transformers/modeling_rope_utils.py @@ -0,0 +1,451 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from typing import Optional, Tuple + +from .configuration_utils import PretrainedConfig +from .utils import is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + + +def _compute_default_rope_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + base = rope_kwargs["base"] + dim = rope_kwargs["dim"] + elif config is not None: + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) + return inv_freq, attention_factor + + +def _compute_linear_scaling_rope_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. 
+ Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + factor = rope_kwargs["factor"] + elif config is not None: + factor = config.rope_scaling["factor"] + + # Gets the default RoPE parameters + inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) + + # Then applies linear scaling to the frequencies. + # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so + # applying scaling to the inverse frequencies is equivalent. + inv_freq /= factor + return inv_freq, attention_factor + + +def _compute_dynamic_ntk_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length, used to update the dynamic RoPE at inference time. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + base = rope_kwargs["base"] + dim = rope_kwargs["dim"] + max_position_embeddings = rope_kwargs["max_position_embeddings"] + factor = rope_kwargs["factor"] + elif config is not None: + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + max_position_embeddings = config.max_position_embeddings + factor = config.rope_scaling["factor"] + + attention_factor = 1.0 # Unused in this type of RoPE + + # seq_len: default to max_position_embeddings, e.g. at init time + seq_len = seq_len if seq_len is not None else max_position_embeddings + + # Compute the inverse frequencies + base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim)) + return inv_freq, attention_factor + + +def _compute_yarn_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with NTK scaling. 
Please refer to the + [original paper](https://arxiv.org/abs/2309.00071) + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # No need to keep BC with yarn, unreleased when this new pattern was created. + if len(rope_kwargs) > 0: + raise ValueError( + f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}" + ) + + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + max_position_embeddings = config.max_position_embeddings + factor = config.rope_scaling["factor"] + + # Sets the attention factor as suggested in the paper + attention_factor = config.rope_scaling.get("attention_factor") + if attention_factor is None: + attention_factor = 0.1 * math.log(factor) + 1.0 + + # Optional config options + # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly) + beta_fast = config.rope_scaling.get("beta_fast") or 32 + beta_slow = config.rope_scaling.get("beta_slow") or 1 + + # Compute the inverse frequencies + def find_correction_dim(num_rotations, dim, base, max_position_embeddings): + """Inverse dimension formula to find the dimension based on the number of rotations""" + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) + + def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings): + """Find dimension range bounds based on rotations""" + low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) + + def linear_ramp_mask(min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (factor * pos_freqs) + + low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings) + + # Get n-dimensional rotational scaling corrected for extrapolation + inv_freq_mask = 1 - linear_ramp_mask(low, high, dim // 2).float().to(device) + inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + + return inv_freq, attention_factor + + +def _compute_longrope_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with LongRoPE scaling. Please refer to the + [original implementation](https://github.com/microsoft/LongRoPE) + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. 
+ device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # No need to keep BC with longrope, unreleased when this new pattern was created. + if len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got " + f"{rope_kwargs}" + ) + + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + long_factor = config.rope_scaling["long_factor"] + short_factor = config.rope_scaling["short_factor"] + factor = config.rope_scaling.get("factor") + attention_factor = config.rope_scaling.get("attention_factor") + + # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a + # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two + # values to compute the default attention scaling factor, instead of using `factor`. + if hasattr(config, "original_max_position_embeddings"): + max_position_embeddings = config.original_max_position_embeddings + expanded_max_position_embeddings = config.max_position_embeddings + factor = expanded_max_position_embeddings / max_position_embeddings + else: + max_position_embeddings = config.max_position_embeddings + expanded_max_position_embeddings = max_position_embeddings * factor + + # Sets the attention factor as suggested in the paper + if attention_factor is None: + if factor <= 1.0: + attention_factor = 1.0 + else: + attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings)) + + # Compute the inverse frequencies -- scaled based on the target sequence length + if expanded_max_position_embeddings > max_position_embeddings: + ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device) + else: + ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device) + inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim + inv_freq = 1.0 / (ext_factors * base**inv_freq_shape) + + return inv_freq, attention_factor + + +# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters +# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE +# parameterizations, as long as the callable has the same signature. 
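+# For example (hypothetical names), a custom parameterization could be registered with
+#     ROPE_INIT_FUNCTIONS["my_rope"] = _compute_my_rope_parameters
+# as long as the callable accepts (config, device, seq_len=None, **rope_kwargs) and returns the
+# (inv_freq, attention_factor) tuple, mirroring the functions defined above.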
+ROPE_INIT_FUNCTIONS = { + "default": _compute_default_rope_parameters, + "linear": _compute_linear_scaling_rope_parameters, + "dynamic": _compute_dynamic_ntk_parameters, + "yarn": _compute_yarn_parameters, + "longrope": _compute_longrope_parameters, +} + + +def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None): + """Compare the received keys in `config.rope_scaling` against the expected and optional keys""" + missing_keys = required_keys - received_keys + if missing_keys: + raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}") + + if optional_keys is not None: + unused_keys = received_keys - required_keys - optional_keys + else: + unused_keys = received_keys - received_keys + if unused_keys: + raise KeyError(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}") + + +def _validate_default_rope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys) + + +def _validate_linear_scaling_rope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + +def _validate_yarn_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "factor"} + optional_keys = {"attention_factor", "beta_fast", "beta_slow"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_scaling.get("attention_factor") + if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): + raise ValueError( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + beta_fast = rope_scaling.get("beta_fast") + if beta_fast is not None and not isinstance(beta_fast, float): + raise ValueError(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") + beta_slow = rope_scaling.get("beta_slow") + if beta_slow is not None and not isinstance(beta_slow, float): + raise ValueError(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") + + if (beta_fast or 32) < (beta_slow or 1): + raise ValueError( + f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " + f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" + ) + + +def _validate_longrope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "short_factor", "long_factor"} + optional_keys = {"attention_factor", "factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, 
required_keys, optional_keys) + + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) + + short_factor = rope_scaling.get("short_factor") + if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor): + raise ValueError(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}") + if not len(short_factor) == dim // 2: + raise ValueError(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}") + + long_factor = rope_scaling.get("long_factor") + if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor): + raise ValueError(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}") + if not len(long_factor) == dim // 2: + raise ValueError(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") + + # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over + # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is + # unique to longrope (= undesirable) + if hasattr(config, "original_max_position_embeddings"): + logger.warning_once( + "This model has set a `original_max_position_embeddings` field, to be used together with " + "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`" + "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, " + "as it is compatible with most model architectures." + ) + else: + factor = rope_scaling.get("factor") + if factor is None: + raise ValueError("Missing required keys in `rope_scaling`: 'factor'") + elif not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_scaling.get("attention_factor") + if attention_factor is not None and not isinstance(attention_factor, float) or attention_factor < 0: + raise ValueError( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + + +# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types. +ROPE_VALIDATION_FUNCTIONS = { + "default": _validate_default_rope_parameters, + "linear": _validate_linear_scaling_rope_parameters, + "dynamic": _validate_linear_scaling_rope_parameters, # `dynamic` has the same validation pattern as `linear` + "yarn": _validate_yarn_parameters, + "longrope": _validate_longrope_parameters, +} + + +def rope_config_validation(config: PretrainedConfig): + """ + Validate the RoPE config arguments, given a `PretrainedConfig` object + """ + rope_scaling = getattr(config, "rope_scaling", None) # not a default parameter in `PretrainedConfig` + if rope_scaling is None: + return + + possible_rope_types = set(ROPE_INIT_FUNCTIONS.keys()) + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + if rope_type is None: + raise ValueError( + f"rope_scaling must contain a non-None 'rope_type' field. 
Possible options are {possible_rope_types}" + ) + + validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) + if validation_fn is not None: + validation_fn(config) + else: + raise ValueError( + f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" + ) diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 1eea9b224958b1..cd10850ae36b7b 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -80,7 +80,8 @@ def forward(self, hidden_states): ALL_LAYERNORM_LAYERS.append(ChameleonRMSNorm) -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): super().__init__() @@ -110,7 +111,8 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonLinearScalingRotaryEmbedding(ChameleonRotaryEmbedding): """ChameleonRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" @@ -121,7 +123,8 @@ def forward(self, x, position_ids): return cos, sin -# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonDynamicNTKScalingRotaryEmbedding(ChameleonRotaryEmbedding): """ChameleonRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" @@ -265,7 +268,8 @@ def __init__(self, config: ChameleonConfig, layer_idx: Optional[int] = None): self.k_norm = ChameleonLayerNorm((self.num_key_value_heads, self.head_dim)) self._init_rope() - # Copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon + # copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon + # TODO(joao): add me back asap :) def _init_rope(self): if self.config.rope_scaling is None: self.rotary_emb = ChameleonRotaryEmbedding( @@ -358,7 +362,8 @@ def forward( return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Chameleon +# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Chameleon +# TODO(joao): add me back asap :) class ChameleonFlashAttention2(ChameleonAttention): """ Chameleon flash attention module. 
This module inherits from `ChameleonAttention` as the weights of the module stays @@ -576,7 +581,8 @@ def forward( } -# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON +# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON +# TODO(joao): add me back asap :) class ChameleonDecoderLayer(nn.Module): def __init__(self, config: ChameleonConfig, layer_idx: int): super().__init__() diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 5322c2334d37b5..6532c656d453e0 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -295,7 +295,8 @@ def forward( return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2 with Llama->Cohere +# TODO(joao): add me back asap :) class CohereFlashAttention2(CohereAttention): """ Cohere flash attention module. This module inherits from `CohereAttention` as the weights of the module stays @@ -409,7 +410,8 @@ def forward( return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention Llama->Cohere +# TODO(joao): add me back asap :) class CohereSdpaAttention(CohereAttention): """ Cohere attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from @@ -697,7 +699,8 @@ def _init_weights(self, module): "The bare Cohere Model outputting raw hidden-states without any specific head on top.", COHERE_START_DOCSTRING, ) -# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere +# copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Cohere +# TODO(joao): add me back asap :) class CohereModel(CoherePreTrainedModel): """ Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`CohereDecoderLayer`] diff --git a/src/transformers/models/jamba/modeling_jamba.py b/src/transformers/models/jamba/modeling_jamba.py index 28d732628a28da..6a03dc82a6983c 100755 --- a/src/transformers/models/jamba/modeling_jamba.py +++ b/src/transformers/models/jamba/modeling_jamba.py @@ -1624,7 +1624,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(JAMBA_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index 16d8335e0a52e7..fa15393a40a5f3 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -1363,7 +1363,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(JETMOE_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 843731eeffc8ee..7c987ec85a0409 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -20,10 +20,7 @@ """LLaMA model configuration""" from ...configuration_utils import PretrainedConfig -from ...utils import logging - - -logger = logging.get_logger(__name__) +from ...modeling_rope_utils import rope_config_validation class LlamaConfig(PretrainedConfig): @@ -84,22 +81,35 @@ class LlamaConfig(PretrainedConfig): rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports three scaling - strategies: linear, dynamic and yarn. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. - For the `yarn` strategy, the dictionary may also contain the following fields: - `original_max_position_embeddings` (`int`, *optional*): - The original maximum sequence length. This is used to scale the RoPE embeddings. + Dictionary containing the scaling configuration for the RoPE embeddings. IMPORTANT: RoPE scaling expects + `max_position_embeddings` to remain unchanged -- some methods, like 'longrope', require the original value + to determine which scaling to apply. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope'], + with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. 
The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + `max_position_embeddings`. `attention_factor` (`float`, *optional*): - The attention scaling factor. If unspecified, it defaults to `0.1 ln(s) + 1`, where `s` is the `original_max_position_embeddings/max_position_embeddings` ratio. + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. `beta_fast` (`float`, *optional*): - Parameter to set the boundary for extrapolation (only) in the linear ramp function. + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. `beta_slow` (`float`, *optional*): - Parameter to set the boundary for interpolation (only) in the linear ramp function. + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `max_position_embeddings` * `factor`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `max_position_embeddings` * `factor`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -167,11 +177,13 @@ def __init__( self.use_cache = use_cache self.rope_theta = rope_theta self.rope_scaling = rope_scaling - self._rope_scaling_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias + # Validate the correctness of rotary position embeddings parameters + rope_config_validation(self) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -179,60 +191,3 @@ def __init__( tie_word_embeddings=tie_word_embeddings, **kwargs, ) - - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. 
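# A hedged usage sketch of the rope_scaling format documented above (values are illustrative,
# not recommendations); the dict is checked at construction time by `rope_config_validation`.
from transformers import LlamaConfig

linear_config = LlamaConfig(rope_scaling={"rope_type": "linear", "factor": 2.0})
yarn_config = LlamaConfig(rope_scaling={"rope_type": "yarn", "factor": 4.0, "beta_fast": 32.0, "beta_slow": 1.0})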
- """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) < 2: - raise ValueError( - "`rope_scaling` must be a dictionary with a minimum of two fields, `type` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "yarn"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic', 'yarn'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") - - if rope_scaling_type != "yarn": - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) > 6: - raise ValueError( - "`rope_scaling` with type " - f"{rope_scaling_type}" - " must be a dictionary with a maximum of six fields, `type`, `factor`," - "`original_max_position_embeddings`, `attention_factor`, `beta_fast`, `beta_slow`, " - f"got {self.rope_scaling}" - ) - original_max_position_embeddings = self.rope_scaling.get("original_max_position_embeddings", None) - attention_factor = self.rope_scaling.get("attention_factor", None) - beta_fast = self.rope_scaling.get("beta_fast", None) - beta_slow = self.rope_scaling.get("beta_slow", None) - - if original_max_position_embeddings is not None and not isinstance(original_max_position_embeddings, int): - raise ValueError( - f"`rope_scaling`'s original_max_position_embeddings field must be an int, got {original_max_position_embeddings}" - ) - if attention_factor is not None and not isinstance(attention_factor, float) or attention_factor < 0: - raise ValueError( - f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" - ) - if beta_fast is not None and not isinstance(beta_fast, float): - raise ValueError(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") - if beta_slow is not None and not isinstance(beta_slow, float): - raise ValueError(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") - - b_fast = beta_fast if beta_fast is not None else 32 - b_slow = beta_slow if beta_slow is not None else 1 - if b_fast < b_slow: - raise ValueError( - f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={b_fast} and beta_slow={b_slow}" - ) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index b624a2d92d0970..3115cee78f7677 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -37,6 +37,7 @@ SequenceClassifierOutputWithPast, TokenClassifierOutput, ) +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import PreTrainedModel from ...pytorch_utils import ALL_LAYERNORM_LAYERS from ...utils import ( @@ -75,24 +76,77 @@ def forward(self, hidden_states): class LlamaRotaryEmbedding(nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[LlamaConfig] = None, + ): super().__init__() - self.scaling_factor = scaling_factor - self.dim = dim - self.max_position_embeddings = 
max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim)) + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. All other arguments will be removed in v4.45" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling["type"]) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) self.register_buffer("inv_freq", inv_freq, persistent=False) - # For BC we register cos and sin cached - self.max_seq_len_cached = max_position_embeddings + self.original_inv_freq = self.inv_freq + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len @torch.no_grad() def forward(self, x, position_ids): - # x: [bs, num_attention_heads, seq_len, head_size] + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) device_type = x.device.type device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): @@ -100,107 +154,37 @@ def forward(self, x, position_ids): emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() sin = emb.sin() + + # Advanced RoPE types (e.g. 
yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) class LlamaLinearScalingRotaryEmbedding(LlamaRotaryEmbedding): """LlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" - def forward(self, x, position_ids): - # difference to the original RoPE: a scaling factor is aplied to the position ids - position_ids = position_ids.float() / self.scaling_factor - cos, sin = super().forward(x, position_ids) - return cos, sin + def __init__(self, *args, **kwargs): + logger.warning_once( + "`LlamaLinearScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use " + "`LlamaRotaryEmbedding`, which now also does linear scaling (simply pass the model config to __init__)." + ) + kwargs["rope_type"] = "linear" + super().__init__(*args, **kwargs) class LlamaDynamicNTKScalingRotaryEmbedding(LlamaRotaryEmbedding): """LlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" - def forward(self, x, position_ids): - # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length - seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / ( - base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(x.device) / self.dim) - ) - self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: this may break with compilation - - cos, sin = super().forward(x, position_ids) - return cos, sin - - -class LlamaYarnScalingRotaryEmbedding(LlamaRotaryEmbedding): - def __init__( - self, - dim, - max_position_embeddings=2048, - base=10000, - scaling_factor=1, - original_max_position_embeddings=2048, - attention_factor=None, - beta_fast=32, - beta_slow=1, - device=None, - ): - super().__init__(dim, max_position_embeddings, base, device, scaling_factor) - - self.original_max_position_embeddings = original_max_position_embeddings - self.attention_factor = attention_factor - self.beta_fast = beta_fast - self.beta_slow = beta_slow - - if self.attention_factor is None: - # Recommended attention factor for LLaMA models. - # For more details please refer to https://arxiv.org/pdf/2309.00071, Eq. 22. 
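# Hedged illustration of that recommended default (Eq. 22 of the YaRN paper); the unified
# `LlamaRotaryEmbedding` applies the resulting value to cos/sin as `attention_scaling` when
# the config does not set `attention_factor`. The helper name below is illustrative only.
import math

def default_yarn_attention_factor(factor: float) -> float:
    # YaRN's suggested attention temperature for LLaMA-style models
    return 0.1 * math.log(factor) + 1.0

default_yarn_attention_factor(4.0)  # ~1.14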
- self.attention_factor = 0.1 * math.log(scaling_factor) + 1.0 - - self.compute_yarn_scaling(device) - - # Inverse dimension formula to find the dimension based on the number of rotations - def find_correction_dim(self, num_rotations, dim, base=10000, max_position_embeddings=2048): - return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) - - # Find dimension range bounds based on rotations - def find_correction_range(self, low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): - low = math.floor(self.find_correction_dim(low_rot, dim, base, max_position_embeddings)) - high = math.ceil(self.find_correction_dim(high_rot, dim, base, max_position_embeddings)) - return max(low, 0), min(high, dim - 1) - - def linear_ramp_mask(self, min, max, dim): - if min == max: - max += 0.001 # Prevent singularity - - linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) - ramp_func = torch.clamp(linear_func, 0, 1) - return ramp_func - - def forward(self, x, position_ids=None): - # Difference to the original RoPE: applies a scaling factor computed with - # the YaRN method (NTK-by-Parts + Attn Scaling) - # x: [bs, num_attention_heads, seq_len, head_size] - cos, sin = super().forward(x, position_ids) - cos = cos * self.mscale - sin = sin * self.mscale - return cos, sin - - def compute_yarn_scaling(self, device): - pos_freqs = self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) - inv_freq_extrapolation = 1.0 / pos_freqs - inv_freq_interpolation = 1.0 / (self.scaling_factor * pos_freqs) - - low, high = self.find_correction_range( - self.beta_fast, self.beta_slow, self.dim, self.base, self.original_max_position_embeddings + def __init__(self, *args, **kwargs): + logger.warning_once( + "`LlamaDynamicNTKScalingRotaryEmbedding` is deprecated an will be removed in v4.45. Please use " + "`LlamaRotaryEmbedding`, which now also does dynamic ntk scaling (simply pass the model config to " + "__init__)." 
) - # Get n-dimensional rotational scaling corrected for extrapolation - inv_freq_mask = 1 - self.linear_ramp_mask(low, high, self.dim // 2).float().to(device) - inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask - - self.register_buffer("inv_freq", inv_freq) - # Get n-dimensional magnitude scaling corrected for interpolation - self.mscale = self.attention_factor + kwargs["rope_type"] = "dynamic" + super().__init__(*args, **kwargs) def rotate_half(x): @@ -317,51 +301,9 @@ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) - self._init_rope() - - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = LlamaRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - # Yarn parameters - kwargs = { - "dim": self.config.rope_scaling.get("original_max_position_embeddings", None), - "max_position_embeddings": self.config.rope_scaling.get("attention_factor", None), - "base": self.config.rope_scaling.get("beta_fast", None), - "scaling_factor": self.config.rope_scaling.get("beta_slow", None), - } - kwargs = {k: v for k, v in kwargs.items() if v is not None} - - if scaling_type == "linear": - self.rotary_emb = LlamaLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = LlamaDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "yarn": - self.rotary_emb = LlamaYarnScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - **kwargs, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + # TODO (joao): remove in v4.45 (RoPE is computed in the model, not in the decoder layers) + self.rotary_emb = LlamaRotaryEmbedding(config=self.config) def forward( self, @@ -372,6 +314,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -402,7 +345,16 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing 
cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -471,6 +423,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if isinstance(past_key_value, StaticCache): raise ValueError( @@ -493,7 +446,16 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -573,6 +535,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: if output_attentions: @@ -589,6 +552,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, ) bsz, q_len, _ = hidden_states.size() @@ -601,7 +565,16 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." 
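# Hedged sketch of the calling pattern this warning points to (tiny illustrative sizes):
# the model computes (cos, sin) once per forward pass and hands the tuple to every decoder
# layer, instead of each attention module rebuilding the rotary embeddings itself.
import torch
from transformers import LlamaConfig
from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding

config = LlamaConfig(hidden_size=64, num_attention_heads=4, num_hidden_layers=2, intermediate_size=128)
rotary_emb = LlamaRotaryEmbedding(config=config)
hidden_states = torch.randn(1, 6, config.hidden_size)
position_ids = torch.arange(6).unsqueeze(0)
position_embeddings = rotary_emb(hidden_states, position_ids)  # (cos, sin), shared across layers
# each decoder layer / attention class then receives it as:
# layer(hidden_states, ..., position_embeddings=position_embeddings)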
+ ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -671,6 +644,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.45 **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -688,6 +662,9 @@ def forward( past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. kwargs (`dict`, *optional*): Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code into the model @@ -705,6 +682,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = residual + hidden_states @@ -867,6 +845,7 @@ def __init__(self, config: LlamaConfig): [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = LlamaRotaryEmbedding(config=config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -933,10 +912,11 @@ def forward( causal_mask = self._update_causal_mask( attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions ) - - # embed positions hidden_states = inputs_embeds + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + # decoder layers all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None @@ -956,6 +936,7 @@ def forward( output_attentions, use_cache, cache_position, + position_embeddings, ) else: layer_outputs = decoder_layer( @@ -966,6 +947,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, ) hidden_states = layer_outputs[0] @@ -1280,7 +1262,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 8e2f4dd5a44a79..dd814cd75fb112 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -85,7 +85,8 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.register_buffer("inv_freq", inv_freq, persistent=False) @torch.no_grad() - # Copied from 
transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward + # copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding.forward + # TODO(joao): add me back asap :) def forward(self, x, position_ids): # x: [bs, num_attention_heads, seq_len, head_size] inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) @@ -396,7 +397,8 @@ def forward( return attn_output, attn_weights, past_key_value -# Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Mistral +# TODO(joao): add me back asap :) class MistralSdpaAttention(MistralAttention): """ Mistral attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from @@ -492,7 +494,8 @@ def forward( } -# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Mistral, LLAMA->MISTRAL +# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Mistral, LLAMA->MISTRAL +# TODO(joao): add me back asap :) class MistralDecoderLayer(nn.Module): def __init__(self, config: MistralConfig, layer_idx: int): super().__init__() @@ -1146,7 +1149,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 4b88afcded370c..82320de79386b5 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -1362,7 +1362,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 59c9b3bf1b66a4..a56baf0653ecd3 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -74,7 +74,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: ALL_LAYERNORM_LAYERS.append(OlmoLayerNorm) -# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmo +# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Olmo +# TODO(joao): add me back asap :) class OlmoRotaryEmbedding(nn.Module): def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): super().__init__() @@ -104,7 +105,8 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo +# copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Olmo +# TODO(joao): add me back asap :) class OlmoLinearScalingRotaryEmbedding(OlmoRotaryEmbedding): """OlmoRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" @@ -115,7 +117,8 @@ def forward(self, x, position_ids): return cos, sin -# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo +# copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Olmo +# TODO(joao): add me back asap :) class OlmoDynamicNTKScalingRotaryEmbedding(OlmoRotaryEmbedding): """OlmoRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" @@ -202,7 +205,8 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: class OlmoAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - # Copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo + # copied from transformers.models.llama.modeling_llama.LlamaAttention.__init__ with Llama->Olmo + # TODO(joao): add me back asap :) def __init__(self, config: OlmoConfig, layer_idx: Optional[int] = None): super().__init__() self.config = config @@ -549,7 +553,8 @@ def __init__(self, config: OlmoConfig, layer_idx: int): self.input_layernorm = OlmoLayerNorm(config.hidden_size) self.post_attention_layernorm = OlmoLayerNorm(config.hidden_size) - # Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer.forward + # copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer.forward + # TODO(joao): add me back asap :) def forward( self, hidden_states: torch.Tensor, @@ -768,7 +773,8 @@ def set_input_embeddings(self, value): self.embed_tokens = value @add_start_docstrings_to_model_forward(OLMO_INPUTS_DOCSTRING) - # Copied from transformers.models.llama.modeling_llama.LlamaModel.forward + # copied from transformers.models.llama.modeling_llama.LlamaModel.forward + # TODO(joao): add me back asap :) def forward( self, input_ids: torch.LongTensor = None, diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index fc1b729fa654b0..af22145e3e9de9 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -999,7 +999,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(PERSIMMON_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 7ad34a5780835b..f80453d3f7d990 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -1282,7 +1282,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index b7d05bbed6ca15..a32f8531e4870c 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -1278,7 +1278,7 @@ def 
set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index c20d74fb18c4d7..d88b5c357e86da 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -1370,7 +1370,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index a172183618021a..ea50a20edea8a8 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -1275,7 +1275,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(STABLELM_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index 430befd24ae364..af532b139ca392 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -1153,7 +1153,7 @@ def set_input_embeddings(self, value): @add_start_docstrings_to_model_forward(STARCODER2_INPUTS_DOCSTRING) def forward( self, - input_ids: torch.LongTensor = None, + input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 81d4c2105586f0..de739c6e70044a 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -485,6 +485,9 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +ROPE_INIT_FUNCTIONS = None + + class PreTrainedModel(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index de7eb7e44156c1..85d352fc814f6f 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -51,12 +51,7 @@ LlamaModel, LlamaTokenizer, ) - from transformers.models.llama.modeling_llama import ( - LlamaDynamicNTKScalingRotaryEmbedding, - LlamaLinearScalingRotaryEmbedding, - LlamaRotaryEmbedding, - LlamaYarnScalingRotaryEmbedding, - ) + from transformers.models.llama.modeling_llama import LlamaLinearScalingRotaryEmbedding, LlamaRotaryEmbedding class LlamaModelTester: @@ -431,9 +426,6 @@ def 
test_model_rope_scaling_from_config(self, scaling_type): def test_model_rope_scaling(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() - hidden_size = config.hidden_size - num_heads = config.num_attention_heads - head_dim = hidden_size // num_heads scaling_factor = 10 short_input_length = 10 long_input_length = int(config.max_position_embeddings * 1.5) @@ -446,11 +438,7 @@ def test_model_rope_scaling(self): position_ids_long = position_ids_long.unsqueeze(0) # Sanity check original RoPE - original_rope = LlamaRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - ).to(torch_device) + original_rope = LlamaRotaryEmbedding(config=config).to(torch_device) original_cos_short, original_sin_short = original_rope(x, position_ids_short) original_cos_long, original_sin_long = original_rope(x, position_ids_long) torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) @@ -458,12 +446,8 @@ def test_model_rope_scaling(self): # Sanity check linear RoPE scaling # New position "x" should match original position with index "x/scaling_factor" - linear_scaling_rope = LlamaLinearScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ).to(torch_device) + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + linear_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) @@ -476,12 +460,8 @@ def test_model_rope_scaling(self): # Sanity check Dynamic NTK RoPE scaling # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase # with scaling_factor (or that `inv_freq` decreases) - ntk_scaling_rope = LlamaDynamicNTKScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ).to(torch_device) + config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} + ntk_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) torch.testing.assert_close(ntk_cos_short, original_cos_short) @@ -493,12 +473,9 @@ def test_model_rope_scaling(self): self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) # Sanity check Yarn RoPE scaling - yarn_scaling_rope = LlamaYarnScalingRotaryEmbedding( - head_dim, - max_position_embeddings=config.max_position_embeddings, - base=config.rope_theta, - scaling_factor=scaling_factor, - ).to(torch_device) + # Scaling should be over the entire input + config.rope_scaling = {"type": "yarn", "factor": scaling_factor} + yarn_scaling_rope = LlamaRotaryEmbedding(config=config).to(torch_device) yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) @@ -512,6 +489,43 @@ def test_model_rope_scaling(self): with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) + def test_rope_class_retrocompatibility(self): + # Delete me when we remove compatibility for the old API :) + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + config.rope_scaling = {"type": "linear", "factor": 10} + + # Inputs + x = torch.randn(1, dtype=torch.float32, device=torch_device) # used exlusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Old API -- under the hood, "type": "linear" is set and `LlamaRotaryEmbedding` is called + old_api_rope = LlamaLinearScalingRotaryEmbedding( + config.hidden_size // config.num_attention_heads, + max_position_embeddings=config.max_position_embeddings, + base=config.rope_theta, + scaling_factor=scaling_factor, + ).to(torch_device) + old_cos_short, old_sin_short = old_api_rope(x, position_ids_short) + old_cos_long, old_sin_long = old_api_rope(x, position_ids_long) + + # New API + config.rope_scaling = {"type": "linear", "factor": scaling_factor} + new_api_rope = LlamaRotaryEmbedding(config=config).to(torch_device) + new_cos_short, new_sin_short = new_api_rope(x, position_ids_short) + new_cos_long, new_sin_long = new_api_rope(x, position_ids_long) + + # The results should match + torch.testing.assert_close(old_cos_short, new_cos_short) + torch.testing.assert_close(old_sin_short, new_sin_short) + torch.testing.assert_close(old_cos_long, new_cos_long) + torch.testing.assert_close(old_sin_long, new_sin_long) + @require_flash_attn @require_torch_gpu @require_bitsandbytes diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py new 
file mode 100644 index 00000000000000..847323d9bf23e4 --- /dev/null +++ b/tests/utils/test_modeling_rope_utils.py @@ -0,0 +1,120 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import LlamaConfig +from transformers.testing_utils import is_torch_available, require_torch, torch_device + + +if is_torch_available(): + import torch + + from transformers import ROPE_INIT_FUNCTIONS + from transformers.modeling_rope_utils import rope_config_validation + + +@require_torch +class RopeTest(unittest.TestCase): + def test_rope_validation(self): + config = LlamaConfig() + all_rope_types = ROPE_INIT_FUNCTIONS.keys() + + # The base config is always valid (default RoPE) + rope_config_validation(config) + + # If we explicitly set the other RoPE types, then validation should fail + for rope_type in all_rope_types: + if rope_type != "default": + config.rope_scaling = {"rope_type": rope_type} + with self.assertRaises(KeyError): + rope_config_validation(config) + + # Parameters are exclusive to their own RoPE type, and should raise an exception if incorrectly passed + valid_param_mapping = { + "factor": ["linear", "dynamic", "yarn", "longrope"], + "attention_factor": ["yarn", "longrope"], + "beta_fast": ["yarn"], + "beta_slow": ["yarn"], + "short_factor": ["longrope"], + "long_factor": ["longrope"], + } + for rope_type in all_rope_types: + if rope_type == "default": + continue # checked above + for param, valid_rope_types in valid_param_mapping.items(): + # Set `param` with a dummy value -- we want to test the dict key + config.rope_scaling = {"rope_type": rope_type, param: True} + if rope_type in valid_rope_types: + continue + else: + with self.assertRaises(KeyError): + rope_config_validation(config) + + def test_default_rope_function_bc(self): + config = LlamaConfig() + device = torch_device + + rope_kwargs = { + "rope_type": "default", + "dim": config.hidden_size // config.num_attention_heads, + "max_position_embeddings": config.max_position_embeddings, + "base": config.rope_theta, + } + + rope_fn = ROPE_INIT_FUNCTIONS["default"] + config_freqs = rope_fn(config=config, device=device)[0] + kwargs_freqs = rope_fn(**rope_kwargs, device=device)[0] + torch.testing.assert_close(config_freqs, kwargs_freqs) + + def test_linear_rope_function_bc(self): + config = LlamaConfig() + config.rope_scaling = {"rope_type": "linear", "factor": 10.0} + device = torch_device + + rope_kwargs = { + "rope_type": "linear", + "dim": config.hidden_size // config.num_attention_heads, + "max_position_embeddings": config.max_position_embeddings, + "base": config.rope_theta, + "factor": 10.0, + } + + rope_fn = ROPE_INIT_FUNCTIONS["linear"] + config_freqs = rope_fn(config=config, device=device)[0] + kwargs_freqs = rope_fn(**rope_kwargs, device=device)[0] + torch.testing.assert_close(config_freqs, kwargs_freqs) + + def test_dynamic_rope_function_bc(self): + config = LlamaConfig() + config.rope_scaling = {"rope_type": "dynamic", "factor": 10.0} + 
device = torch_device + + rope_kwargs = { + "rope_type": "dynamic", + "dim": config.hidden_size // config.num_attention_heads, + "max_position_embeddings": config.max_position_embeddings, + "base": config.rope_theta, + "factor": 10.0, + } + + rope_fn = ROPE_INIT_FUNCTIONS["dynamic"] + config_freqs = rope_fn(config=config, device=device)[0] + kwargs_freqs = rope_fn(**rope_kwargs, device=device)[0] + torch.testing.assert_close(config_freqs, kwargs_freqs) + + +# TODO(joao): numerical checks for the different RoPE fns From a1844a3209eb7e75582684809203bc189931a90c Mon Sep 17 00:00:00 2001 From: Ita Zaporozhets <31893021+itazap@users.noreply.github.com> Date: Tue, 23 Jul 2024 11:45:54 +0200 Subject: [PATCH 17/73] gguf conversion add_prefix_space=None for llama3 (#31937) * gguf conversion forces add_prefix_space=False for llama3, this is not required and forces from_slow, which fails. changing to None + test * typo * clean test --- src/transformers/integrations/ggml.py | 2 +- tests/quantization/ggml/test_ggml.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 5c2d72c345ecf9..71aa87afa94b5d 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -609,7 +609,7 @@ def tokenizer(self, proto): self.additional_kwargs["bos_token"] = eos_token if self.is_llama_3_tokenizer: - self.additional_kwargs["add_prefix_space"] = False + self.additional_kwargs["add_prefix_space"] = None self.additional_kwargs["clean_up_tokenization_spaces"] = True self.additional_kwargs["legacy"] = False diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index db96e9052c5f36..a5866094a1cc6f 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -174,10 +174,13 @@ def test_qwen2_q4_0(self): self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) def test_llama3_q4_0_tokenizer(self): - tokenizer_gguf = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id) - special_sentence = "สวัสดี" - predicted_text = tokenizer_gguf.decode(tokenizer_gguf.encode(special_sentence, return_tensors="pt")[0]) - self.assertEqual(predicted_text, "<|begin_of_text|>" + special_sentence) + tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id) + with tempfile.TemporaryDirectory() as tmpdirname: + tokenizer.save_pretrained(tmpdirname) + tokenizer = AutoTokenizer.from_pretrained(tmpdirname) + special_sentence = "สวัสดี" + predicted_text = tokenizer.decode(tokenizer.encode(special_sentence, return_tensors="pt")[0]) + self.assertEqual(predicted_text, "<|begin_of_text|>" + special_sentence) def test_llama3_q4_0(self): tokenizer = AutoTokenizer.from_pretrained(self.llama3_model_id, gguf_file=self.q4_llama3_model_id) From a5b226ce9811aa6b31af0bc9c09c54493a4e67c1 Mon Sep 17 00:00:00 2001 From: Cyril Vallez Date: Tue, 23 Jul 2024 12:21:23 +0200 Subject: [PATCH 18/73] Fix flash attention speed issue (#32028) Add the lru_cache for speed --- src/transformers/utils/import_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index f81b9d3dba41bd..a5ea4eb1850c57 100755 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -820,6 +820,7 @@ def is_flash_attn_greater_or_equal_2_10(): return 
version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.1.0") +@lru_cache() def is_flash_attn_greater_or_equal(library_version: str): if not _is_package_available("flash_attn"): return False From 9ced33ca7f909d9ace743dac083daba99c904d46 Mon Sep 17 00:00:00 2001 From: Merve Noyan Date: Tue, 23 Jul 2024 13:23:23 +0300 Subject: [PATCH 19/73] Fix video batching to videollava (#32139) --------- Co-authored-by: Merve Noyan --- .../image_processing_video_llava.py | 7 ++- .../test_image_processing_video_llava.py | 43 ++++++++++++++----- 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/video_llava/image_processing_video_llava.py b/src/transformers/models/video_llava/image_processing_video_llava.py index 82ac5869c01740..943c2fe51a0ef4 100644 --- a/src/transformers/models/video_llava/image_processing_video_llava.py +++ b/src/transformers/models/video_llava/image_processing_video_llava.py @@ -55,8 +55,11 @@ def make_batched_videos(videos) -> List[VideoInput]: if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]): return videos - elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]) and len(videos[0].shape) == 4: - return [list(video) for video in videos] + elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]): + if isinstance(videos[0], PIL.Image.Image): + return [videos] + elif len(videos[0].shape) == 4: + return [list(video) for video in videos] elif is_valid_image(videos) and len(videos.shape) == 4: return [list(videos)] diff --git a/tests/models/video_llava/test_image_processing_video_llava.py b/tests/models/video_llava/test_image_processing_video_llava.py index 4a5c2516267e13..03cfb033ffb91f 100644 --- a/tests/models/video_llava/test_image_processing_video_llava.py +++ b/tests/models/video_llava/test_image_processing_video_llava.py @@ -97,8 +97,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F torchify=torchify, ) - def prepare_video_inputs(self, equal_resolution=False, torchify=False): - numpify = not torchify + def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False): images = prepare_image_inputs( batch_size=self.batch_size, num_channels=self.num_channels, @@ -108,15 +107,19 @@ def prepare_video_inputs(self, equal_resolution=False, torchify=False): numpify=numpify, torchify=torchify, ) - # let's simply copy the frames to fake a long video-clip - videos = [] - for image in images: - if numpify: - video = image[None, ...].repeat(8, 0) - else: - video = image[None, ...].repeat(8, 1, 1, 1) - videos.append(video) + if numpify or torchify: + videos = [] + for image in images: + if numpify: + video = image[None, ...].repeat(8, 0) + else: + video = image[None, ...].repeat(8, 1, 1, 1) + videos.append(video) + else: + videos = [] + for pil_image in images: + videos.append([pil_image] * 8) return videos @@ -197,7 +200,7 @@ def test_call_numpy_videos(self): # Initialize image_processing image_processing = self.image_processing_class(**self.image_processor_dict) # create random numpy tensors - video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True) + video_inputs = self.image_processor_tester.prepare_video_inputs(numpify=True, equal_resolution=True) for video in video_inputs: self.assertIsInstance(video, np.ndarray) @@ -211,6 +214,24 @@ def test_call_numpy_videos(self): expected_output_video_shape = (5, 8, 3, 18, 18) self.assertEqual(tuple(encoded_videos.shape), 
expected_output_video_shape) + def test_call_pil_videos(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # the inputs come in list of lists batched format + video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=True) + for video in video_inputs: + self.assertIsInstance(video[0], Image.Image) + + # Test not batched input + encoded_videos = image_processing(images=None, videos=video_inputs[0], return_tensors="pt").pixel_values_videos + expected_output_video_shape = (1, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + + # Test batched + encoded_videos = image_processing(images=None, videos=video_inputs, return_tensors="pt").pixel_values_videos + expected_output_video_shape = (5, 8, 3, 18, 18) + self.assertEqual(tuple(encoded_videos.shape), expected_output_video_shape) + def test_call_pytorch(self): # Initialize image_processing image_processing = self.image_processing_class(**self.image_processor_dict) From bab32d6fe932a3372fbd6d5a84e3cacb12a61ae0 Mon Sep 17 00:00:00 2001 From: Alexandre TL Date: Tue, 23 Jul 2024 12:32:19 +0200 Subject: [PATCH 20/73] Added mamba.py backend (#30139) * Update README.md * tests: forward ok * backward test done * done testing * removed check. scripts * Update README.md * added use_mambapy arg * fixed typo in warning * protected imports w/ mambapy package * delete pscan.py + raise rather than assert * Update import_utils.py * fix whitespaces and unused import * trailing whitespace + import block unformatted * Update modeling_mamba.py * transpose before pscan * shape comment * ran make style * use_mambapy=False by default Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * ran make fix-copies --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- .../models/mamba/configuration_mamba.py | 4 ++ .../models/mamba/modeling_mamba.py | 58 ++++++++++++++----- src/transformers/utils/import_utils.py | 6 ++ 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/mamba/configuration_mamba.py b/src/transformers/models/mamba/configuration_mamba.py index 460c1f3b32acbf..89f08dd3cd3276 100644 --- a/src/transformers/models/mamba/configuration_mamba.py +++ b/src/transformers/models/mamba/configuration_mamba.py @@ -79,6 +79,8 @@ class MambaConfig(PretrainedConfig): Whether or not to rescale `out_proj` weights when initializing. use_cache (`bool`, *optional*, defaults to `True`): Whether or not the cache should be used. + use_mambapy (`bool`, *optional*, defaults to `False`): + Determines the fallback strategy during training if the CUDA-based official implementation of Mamba is not avaiable. If `True`, the mamba.py implementation is used. If `False`, the naive and slower implementation is used. Consider switching to the naive version if memory is limited. 
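A hedged sketch of how the new flag is meant to be used (the sizes below are illustrative toy values); note that enabling it requires the `mambapy` package whenever the CUDA kernels are unavailable:

from transformers import MambaConfig, MambaForCausalLM

config = MambaConfig(vocab_size=100, hidden_size=64, num_hidden_layers=2, use_mambapy=True)
model = MambaForCausalLM(config)  # training falls back to mamba.py's parallel scan if the CUDA kernels are missing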
Example: @@ -123,6 +125,7 @@ def __init__( time_step_floor=1e-4, rescale_prenorm_residual=False, use_cache=True, + use_mambapy=False, **kwargs, ): self.vocab_size = vocab_size @@ -149,5 +152,6 @@ def __init__( self.rescale_prenorm_residual = rescale_prenorm_residual self.residual_in_fp32 = residual_in_fp32 self.use_cache = use_cache + self.use_mambapy = use_mambapy super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index 5edb28ad7416e3..50c0f9ebe4a580 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -33,12 +33,17 @@ add_start_docstrings_to_model_forward, logging, ) -from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available +from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available, is_mambapy_available from .configuration_mamba import MambaConfig logger = logging.get_logger(__name__) +if is_mambapy_available(): + from mambapy.pscan import pscan +else: + pscan = None + if is_mamba_ssm_available(): from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn from mamba_ssm.ops.triton.selective_state_update import selective_state_update @@ -87,6 +92,8 @@ def __init__(self, config: MambaConfig, layer_idx: int): self.activation = config.hidden_act self.act = ACT2FN[config.hidden_act] + self.use_mambapy = config.use_mambapy + # projection of the input hidden states self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias) # selective projection used to make dt, B and C input dependant @@ -105,11 +112,23 @@ def __init__(self, config: MambaConfig, layer_idx: int): self.use_bias = config.use_bias if not is_fast_path_available: - logger.warning_once( - "The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" - " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" - " https://github.com/Dao-AILab/causal-conv1d" - ) + if self.use_mambapy: + if is_mambapy_available(): + logger.warning_once( + "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" + " is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d" + ) + else: + raise ImportError( + "use_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py." + ) + else: + logger.warning_once( + "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`" + " is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py." 
+                )

    def cuda_kernels_forward(
        self,
@@ -257,17 +276,24 @@ def slow_forward(self, input_states, cache_params: Optional[MambaCache]=None, ca
        deltaB_u = discrete_B * hidden_states[:, :, :, None].float()

        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
-        scan_outputs = []
-        for i in range(seq_len):
-            ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :]  # [batch, intermediate_size, ssm_state]
-            scan_output = torch.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1))  # [batch, intermediate_size, 1]
-            scan_outputs.append(scan_output[:, :, 0])
-        scan_output = torch.stack(scan_outputs, dim=-1)  # [batch, intermediate_size, seq_len]
-        scan_output = scan_output + (hidden_states * self.D[None, :, None])
-        scan_output = (scan_output * self.act(gate))
+        if self.use_mambapy and self.training and cache_params is None:
+            hs = pscan(discrete_A.transpose(1, 2), deltaB_u.transpose(1, 2))  # [batch, seq_len, intermediate_size, ssm_state_size]
-        if cache_params is not None:
-            cache_params.update_ssm_state(self.layer_idx, ssm_state)
+            scan_output = (hs @ C.unsqueeze(-1)).squeeze(3).transpose(1, 2)  # [batch, intermediate_size, seq_len]
+            scan_output = scan_output + hidden_states * self.D[None, :, None]
+            scan_output = scan_output * self.act(gate)
+        else:
+            scan_outputs = []
+            for i in range(seq_len):
+                ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :]  # [batch, intermediate_size, ssm_state]
+                scan_output = torch.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1))  # [batch, intermediate_size, 1]
+                scan_outputs.append(scan_output[:, :, 0])
+            scan_output = torch.stack(scan_outputs, dim=-1)  # [batch, intermediate_size, seq_len]
+            scan_output = scan_output + (hidden_states * self.D[None, :, None])
+            scan_output = (scan_output * self.act(gate))
+
+            if cache_params is not None:
+                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)

        # 4. Final linear projection
        contextualized_states = self.out_proj(scan_output.transpose(1, 2))  # [batch, seq_len, hidden_size]

diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index a5ea4eb1850c57..c52da62c1de8e5 100755
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -395,6 +395,12 @@ def is_causal_conv1d_available():
    return False

+def is_mambapy_available():
+    if is_torch_available():
+        return _is_package_available("mambapy")
+    return False
+
+
 def is_torch_mps_available():
    if is_torch_available():
        import torch

From 034b47784765e37ecc20f7ad43640f1a2c0094fd Mon Sep 17 00:00:00 2001
From: Amit Garg
Date: Tue, 23 Jul 2024 03:33:22 -0700
Subject: [PATCH 21/73] Rename Phi-3 rope scaling type (#31436)

* renamed phi3 rope_scaling type

* fixed trailing whitespaces

* fixed test

* added warning

* fixed format
---
 .../models/phi3/configuration_phi3.py         | 20 ++++++-
 src/transformers/models/phi3/modeling_phi3.py | 58 ++++++++++++++++---
 tests/models/phi3/test_modeling_phi3.py       |  2 +-
 3 files changed, 69 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py
index 8e1ac3628c2b23..4940f43e5bffe3 100644
--- a/src/transformers/models/phi3/configuration_phi3.py
+++ b/src/transformers/models/phi3/configuration_phi3.py
@@ -78,7 +78,7 @@ class Phi3Config(PretrainedConfig):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied.
If a dictionary, it must - contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and + contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size divided by the number of attention heads divided by 2. bos_token_id (`int`, *optional*, defaults to 1): @@ -155,6 +155,7 @@ def __init__( self.use_cache = use_cache self.rope_theta = rope_theta self.rope_scaling = rope_scaling + self._rope_scaling_adjustment() self._rope_scaling_validation() self.sliding_window = sliding_window @@ -166,6 +167,19 @@ def __init__( **kwargs, ) + def _rope_scaling_adjustment(self): + """ + Adjust the `type` of the `rope_scaling` configuration for backward compatibility. + """ + if self.rope_scaling is None: + return + + rope_scaling_type = self.rope_scaling.get("type", None) + + # For backward compatibility if previous version used "su" or "yarn" + if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]: + self.rope_scaling["type"] = "longrope" + def _rope_scaling_validation(self): """ Validate the `rope_scaling` configuration. @@ -181,8 +195,8 @@ def _rope_scaling_validation(self): rope_scaling_type = self.rope_scaling.get("type", None) rope_scaling_short_factor = self.rope_scaling.get("short_factor", None) rope_scaling_long_factor = self.rope_scaling.get("long_factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]: - raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}") + if rope_scaling_type is None or rope_scaling_type not in ["longrope"]: + raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}") if not ( isinstance(rope_scaling_short_factor, list) and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor) diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index a32f8531e4870c..76e3fbf514f6d6 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -16,6 +16,7 @@ """PyTorch Phi-3 model.""" import math +import warnings from typing import List, Optional, Tuple, Union import torch @@ -106,6 +107,51 @@ def forward(self, x, position_ids, seq_len=None): class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding): def __init__(self, dim, config, device=None): + warnings.warn( + "The class Phi3SuScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers. 
Please" + " use Phi3LongRoPEScaledRotaryEmbedding instead.", + FutureWarning, + ) + super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) + + self.short_factor = config.rope_scaling["short_factor"] + self.long_factor = config.rope_scaling["long_factor"] + self.original_max_position_embeddings = config.original_max_position_embeddings + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + seq_len = torch.max(position_ids) + 1 + if seq_len > self.original_max_position_embeddings: + ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device) + else: + ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device) + inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim + self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + cos = emb.cos() * scaling_factor + sin = emb.sin() * scaling_factor + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding): + def __init__(self, dim, config, device=None): + warnings.warn( + "The class Phi3YarnScaledRotaryEmbedding is deprecated and will be removed in version 5 of Transformers", + FutureWarning, + ) super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) self.short_factor = config.rope_scaling["short_factor"] @@ -138,14 +184,14 @@ def forward(self, x, position_ids, seq_len=None): if scale <= 1.0: scaling_factor = 1.0 else: - scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) + scaling_factor = 0.1 * math.log(scale) + 1.0 cos = emb.cos() * scaling_factor sin = emb.sin() * scaling_factor return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding): +class Phi3LongRoPEScaledRotaryEmbedding(Phi3RotaryEmbedding): def __init__(self, dim, config, device=None): super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) @@ -179,7 +225,7 @@ def forward(self, x, position_ids, seq_len=None): if scale <= 1.0: scaling_factor = 1.0 else: - scaling_factor = 0.1 * math.log(scale) + 1.0 + scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings)) cos = emb.cos() * scaling_factor sin = emb.sin() * scaling_factor @@ -300,10 +346,8 @@ def _init_rope(self): ) else: scaling_type = self.config.rope_scaling["type"] - if scaling_type == "su": - self.rotary_emb = Phi3SuScaledRotaryEmbedding(self.head_dim, self.config) - elif scaling_type == "yarn": - self.rotary_emb = Phi3YarnScaledRotaryEmbedding(self.head_dim, self.config) + if scaling_type == "longrope": + self.rotary_emb = 
Phi3LongRoPEScaledRotaryEmbedding(self.head_dim, self.config) else: raise ValueError(f"Unknown RoPE scaling type {scaling_type}") diff --git a/tests/models/phi3/test_modeling_phi3.py b/tests/models/phi3/test_modeling_phi3.py index ad9c4c46aa931e..1ddc73961bfedf 100644 --- a/tests/models/phi3/test_modeling_phi3.py +++ b/tests/models/phi3/test_modeling_phi3.py @@ -362,7 +362,7 @@ def test_phi3_sequence_classification_model_for_multi_label(self): result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels) self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels)) - @parameterized.expand([("su",), ("yarn",)]) + @parameterized.expand([("longrope",)]) def test_model_rope_scaling_from_config(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() short_input = ids_tensor([1, 10], config.vocab_size) From 3263b3435473cbb5dc66925bc29c1d32b5b8d431 Mon Sep 17 00:00:00 2001 From: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Date: Tue, 23 Jul 2024 18:34:30 +0800 Subject: [PATCH 22/73] Revert "Incorrect Whisper long-form decoding timestamps " (#32148) Revert "Incorrect Whisper long-form decoding timestamps (#32003)" This reverts commit cd48553fc8375e1a28d4d82cfe231dedf6a23af8. --- .../models/clvp/processing_clvp.py | 1 + .../models/whisper/processing_whisper.py | 7 -- .../models/whisper/tokenization_whisper.py | 42 ++++-------- .../whisper/tokenization_whisper_fast.py | 42 ++++-------- tests/models/whisper/test_modeling_whisper.py | 66 ------------------- 5 files changed, 25 insertions(+), 133 deletions(-) diff --git a/src/transformers/models/clvp/processing_clvp.py b/src/transformers/models/clvp/processing_clvp.py index ebccab89d0fca3..4e015cea1f8475 100644 --- a/src/transformers/models/clvp/processing_clvp.py +++ b/src/transformers/models/clvp/processing_clvp.py @@ -73,6 +73,7 @@ def __call__(self, *args, **kwargs): inputs["attention_mask"] = encodings["attention_mask"] return inputs + # Copied from transformers.models.whisper.processing_whisper.WhisperProcessor.batch_decode with Whisper->Clvp def batch_decode(self, *args, **kwargs): """ This method forwards all its arguments to ClvpTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please diff --git a/src/transformers/models/whisper/processing_whisper.py b/src/transformers/models/whisper/processing_whisper.py index 07ece4314b249b..f22aae143e6bc4 100644 --- a/src/transformers/models/whisper/processing_whisper.py +++ b/src/transformers/models/whisper/processing_whisper.py @@ -84,13 +84,6 @@ def batch_decode(self, *args, **kwargs): This method forwards all its arguments to WhisperTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information. """ - - # If segments are present in args, we are performing long-form generation and need to return long form timestamps. - # The long-form timestamps are already present in segments and should be passed as kwargs to batch_decode. 
- if isinstance(args[0], dict) and "segments" in args[0]: - kwargs["longform_timestamps"] = args[0].pop("segments") - args = tuple(args[0]["sequences"].unsqueeze(0)) - return self.tokenizer.batch_decode(*args, **kwargs) def decode(self, *args, **kwargs): diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py index 0ff0bc8245edac..85e7dd04d8b249 100644 --- a/src/transformers/models/whisper/tokenization_whisper.py +++ b/src/transformers/models/whisper/tokenization_whisper.py @@ -558,7 +558,7 @@ def _decode_with_timestamps(self, token_ids, skip_special_tokens=False, time_pre ] return "".join(outputs) - def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=None): + def _compute_offsets(self, token_ids, time_precision=0.02): """ Compute offsets for a given tokenized input @@ -567,8 +567,6 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N List of tokenized input ids. Can be obtained using the `__call__` method. time_precision (`float`, `optional`, defaults to 0.02): The time ratio to convert from token to time. - longform_timestamps (List[dict], *optional*): - Timestamps obtained using long form generation in Whisper, to be used to replace predicted timestamps in token_ids. """ offsets = [] # ensure torch tensor of token ids is placed on cpu @@ -589,7 +587,7 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N consecutive = np.append(consecutive, np.where(timestamp_tokens)[0][-1] + 1) last_slice = np.where(timestamp_tokens)[0][0] - for i, current_slice in enumerate(consecutive): + for current_slice in consecutive: sliced_tokens = token_ids[last_slice:current_slice] if len(sliced_tokens) > 1: start_timestamp_position = sliced_tokens[0].item() - timestamp_begin @@ -598,27 +596,15 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N sliced_tokens = self._preprocess_token_ids(sliced_tokens) text = self._decode(sliced_tokens) text = self._filter_timestamp_ids(text) - - if longform_timestamps is not None: - offsets.append( - { - "text": text, - "timestamp": ( - longform_timestamps[0][i]["start"].item(), - longform_timestamps[0][i]["end"].item(), - ), - } - ) - else: - offsets.append( - { - "text": text, - "timestamp": ( - start_timestamp_position * time_precision, - end_timestamp_position * time_precision, - ), - } - ) + offsets.append( + { + "text": text, + "timestamp": ( + start_timestamp_position * time_precision, + end_timestamp_position * time_precision, + ), + } + ) last_slice = current_slice return offsets @@ -727,11 +713,7 @@ def decode( # retrieve offsets if output_offsets: - longform_timestamps = kwargs.get("longform_timestamps") - offsets = self._compute_offsets( - token_ids, time_precision=time_precision, longform_timestamps=longform_timestamps - ) - + offsets = self._compute_offsets(token_ids, time_precision=time_precision) return {"text": text, "offsets": offsets} return text diff --git a/src/transformers/models/whisper/tokenization_whisper_fast.py b/src/transformers/models/whisper/tokenization_whisper_fast.py index 540056df8bd807..d1205d1a8ec01b 100644 --- a/src/transformers/models/whisper/tokenization_whisper_fast.py +++ b/src/transformers/models/whisper/tokenization_whisper_fast.py @@ -200,7 +200,7 @@ def _decode_with_timestamps(self, token_ids, skip_special_tokens=False, time_pre return "".join(outputs) # Copied from 
transformers.models.whisper.tokenization_whisper.WhisperTokenizer._compute_offsets - def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=None): + def _compute_offsets(self, token_ids, time_precision=0.02): """ Compute offsets for a given tokenized input @@ -209,8 +209,6 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N List of tokenized input ids. Can be obtained using the `__call__` method. time_precision (`float`, `optional`, defaults to 0.02): The time ratio to convert from token to time. - longform_timestamps (List[dict], *optional*): - Timestamps obtained using long form generation in Whisper, to be used to replace predicted timestamps in token_ids. """ offsets = [] # ensure torch tensor of token ids is placed on cpu @@ -231,7 +229,7 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N consecutive = np.append(consecutive, np.where(timestamp_tokens)[0][-1] + 1) last_slice = np.where(timestamp_tokens)[0][0] - for i, current_slice in enumerate(consecutive): + for current_slice in consecutive: sliced_tokens = token_ids[last_slice:current_slice] if len(sliced_tokens) > 1: start_timestamp_position = sliced_tokens[0].item() - timestamp_begin @@ -240,27 +238,15 @@ def _compute_offsets(self, token_ids, time_precision=0.02, longform_timestamps=N sliced_tokens = self._preprocess_token_ids(sliced_tokens) text = self._decode(sliced_tokens) text = self._filter_timestamp_ids(text) - - if longform_timestamps is not None: - offsets.append( - { - "text": text, - "timestamp": ( - longform_timestamps[0][i]["start"].item(), - longform_timestamps[0][i]["end"].item(), - ), - } - ) - else: - offsets.append( - { - "text": text, - "timestamp": ( - start_timestamp_position * time_precision, - end_timestamp_position * time_precision, - ), - } - ) + offsets.append( + { + "text": text, + "timestamp": ( + start_timestamp_position * time_precision, + end_timestamp_position * time_precision, + ), + } + ) last_slice = current_slice return offsets @@ -373,11 +359,7 @@ def decode( # retrieve offsets if output_offsets: - longform_timestamps = kwargs.get("longform_timestamps") - offsets = self._compute_offsets( - token_ids, time_precision=time_precision, longform_timestamps=longform_timestamps - ) - + offsets = self._compute_offsets(token_ids, time_precision=time_precision) return {"text": text, "offsets": offsets} return text diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 5a59f7a72517a4..38ccf82af4627e 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -2243,72 +2243,6 @@ def test_tiny_timestamp_generation(self): transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - @slow - def test_tiny_longform_timestamps_generation(self): - set_seed(0) - processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") - model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") - model.to(torch_device) - - sample = self._load_datasamples(1) - input_speech = np.concatenate(sample * 10) - - input_features = processor(input_speech, return_tensors="pt", truncation=False, sampling_rate=16_000) - input_features = input_features.to(torch_device) - - generated_ids = model.generate(**input_features, return_timestamps=True, return_segments=True) - - EXPECTED_TRANSCRIPT = [ - { - "text": " Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel. Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "offsets": [ - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (0.0, 6.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (6.0, 12.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (12.0, 18.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (18.0, 24.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (24.0, 29.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (29.0, 35.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (35.0, 41.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (41.0, 47.0), - }, - { - "text": " Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (47.0, 53.0), - }, - { - "text": " Mr. 
Quilter is the apostle of the middle classes and we are glad to welcome his gospel.", - "timestamp": (53.0, 58.20000076293945), - }, - ], - } - ] - - transcript = processor.batch_decode(generated_ids, skip_special_tokens=True, output_offsets=True) - self.assertEqual(transcript, EXPECTED_TRANSCRIPT) - @slow def test_large_timestamp_generation(self): set_seed(0) From a009fbdab32a4b068c24052a4dfe7a7bc0fc89f9 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Tue, 23 Jul 2024 12:23:34 +0100 Subject: [PATCH 23/73] Fix typing to be compatible with later py versions (#32155) --- src/transformers/tokenization_utils_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 7ffd3bbcaa6be7..9e478b63b284ad 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1948,7 +1948,7 @@ def is_active(self) -> bool: return self._rendered_blocks or self._generation_indices @contextmanager - def activate_tracker(self, rendered_blocks: list[int], generation_indices: list[int]): + def activate_tracker(self, rendered_blocks: List[int], generation_indices: List[int]): try: if self.is_active(): raise ValueError("AssistantTracker should not be reused before closed") From 63700628adb91600c84fe3bbbc4c667cd3e3aa71 Mon Sep 17 00:00:00 2001 From: Alvaro Moran <6949769+tengomucho@users.noreply.github.com> Date: Tue, 23 Jul 2024 14:18:19 +0200 Subject: [PATCH 24/73] feat(cache): StaticCache uses index_copy_ to avoid useless copy (#31857) * feat(cache): StaticCache uses index_copy_ to avoid useless copy Using index_copy_ allows for explicit in-place change of the tensor. Some backends (XLA) will otherwise copy the tensor, making the code slower and using more memory. Proposed implementation will end up using less memory and on XLA will result in less compilation, but the change is also quite generic, making no change whatsoever on CUDA or CPU backend. * feat(cache): SlidingWindowCache uses index_copy_ to avoid useless copy Applying the same change done in StaticCache. * fix(cache): fallback of index_copy_ when not implemented * fix(cache): in index_copy_ ensure tensors are on same device * [run slow] llama * fix(cache): add move of cache_position to same device in SlidingWindowCache * Revert "[run slow] llama" This reverts commit 02608dd14253ccd464e31c108e0cd94364f0e8b9. --- src/transformers/cache_utils.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index 34b457ce018956..0c03ea2735db3f 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -862,8 +862,18 @@ def update( k_out.copy_(key_states) v_out.copy_(value_states) else: - k_out[:, :, cache_position] = key_states - v_out[:, :, cache_position] = value_states + # Note: here we use `tensor.index_copy_(dim, index, tensor)` that is equivalent to + # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place + # operation, that avoids copies and uses less memory. 
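For reference, the equivalence that the new comment relies on can be checked with plain PyTorch. This is a standalone sketch with made-up shapes, independent of the cache classes in this patch:

    import torch

    k_out = torch.zeros(1, 2, 8, 4)           # (batch, num_heads, max_cache_len, head_dim)
    key_states = torch.randn(1, 2, 3, 4)      # three new key vectors to write
    cache_position = torch.tensor([5, 6, 7])  # cache slots to fill along dim 2

    expected = k_out.clone()
    expected[:, :, cache_position] = key_states       # advanced-indexing assignment
    k_out.index_copy_(2, cache_position, key_states)  # explicit in-place copy along dim 2
    assert torch.equal(k_out, expected)

Both writes produce the same cache contents; the claim that `index_copy_` avoids an extra copy on XLA comes from the commit message above and is not verified here.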
+ try: + # If using several devices (e.g.: multiple GPUs), we need to ensure everything is on the same one + cache_position.to(device=k_out.device) + k_out.index_copy_(2, cache_position, key_states) + v_out.index_copy_(2, cache_position, value_states) + except NotImplementedError: + # The operator 'aten::index_copy.out' is not currently implemented for the MPS device. + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states return k_out, v_out @@ -958,8 +968,14 @@ def update( k_out = k_out[:, :, indices] v_out = v_out[:, :, indices] - k_out[:, :, cache_position] = key_states - v_out[:, :, cache_position] = value_states + try: + cache_position.to(device=k_out.device) + k_out.index_copy_(2, cache_position, key_states) + v_out.index_copy_(2, cache_position, value_states) + except NotImplementedError: + # The operator 'aten::index_copy.out' is not currently implemented for the MPS device. + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states # `_.zero()` followed by `+=` is equivalent `=`, but compile-friendly (without graph breaks due to assignment) self.key_cache[layer_idx].zero_() From 7d92009af647167bae338e9d4af8bc0452c62fbf Mon Sep 17 00:00:00 2001 From: Deep Gandhi <97520292+DeF0017@users.noreply.github.com> Date: Tue, 23 Jul 2024 19:11:52 +0530 Subject: [PATCH 25/73] Added additional kwarg for successful running of optuna hyperparameter search (#31924) Update integration_utils.py Added additional kwarg --- src/transformers/integrations/integration_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index e9c91192ecf9a2..399c9f60cfa685 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -252,10 +252,11 @@ def _objective(trial, checkpoint_dir=None): timeout = kwargs.pop("timeout", None) n_jobs = kwargs.pop("n_jobs", 1) + gc_after_trial = kwargs.pop("gc_after_trial", False) directions = direction if isinstance(direction, list) else None direction = None if directions is not None else direction study = optuna.create_study(direction=direction, directions=directions, **kwargs) - study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs) + study.optimize(_objective, n_trials=n_trials, timeout=timeout, n_jobs=n_jobs, gc_after_trial=gc_after_trial) if not study._is_multi_objective(): best_trial = study.best_trial return BestRun(str(best_trial.number), best_trial.value, best_trial.params) From 9cf4f2aa9a9cecbb22e813931ef3bb72fc773540 Mon Sep 17 00:00:00 2001 From: RhuiDih <166782544+RhuiDih@users.noreply.github.com> Date: Tue, 23 Jul 2024 21:56:41 +0800 Subject: [PATCH 26/73] Enhancing SFT Training Efficiency Using Packing and FlashAttention2 with Position IDs (#31629) * add DataCollatorBatchFlattening * Update data_collator.py * change name * new FA2 flow if position_ids is provided * add comments * minor fix * minor fix data collator * add test cases for models * add test case for data collator * remove extra code * formating for ruff check and check_repo.py * ruff format ruff format tests src utils * custom_init_isort.py --- docs/source/en/main_classes/data_collator.md | 5 ++ src/transformers/__init__.py | 2 + src/transformers/data/__init__.py | 1 + src/transformers/data/data_collator.py | 35 ++++++++ .../modeling_flash_attention_utils.py | 79 +++++++++++++++++++ src/transformers/models/dbrx/modeling_dbrx.py | 1 + 
.../models/falcon/modeling_falcon.py | 1 + .../models/gemma/modeling_gemma.py | 1 + .../models/llama/modeling_llama.py | 1 + .../models/mistral/modeling_mistral.py | 1 + .../models/mixtral/modeling_mixtral.py | 1 + src/transformers/models/olmo/modeling_olmo.py | 1 + src/transformers/models/phi/modeling_phi.py | 1 + src/transformers/models/phi3/modeling_phi3.py | 1 + .../models/qwen2/modeling_qwen2.py | 1 + .../models/qwen2_moe/modeling_qwen2_moe.py | 1 + .../models/stablelm/modeling_stablelm.py | 1 + .../models/starcoder2/modeling_starcoder2.py | 1 + tests/test_modeling_common.py | 72 +++++++++++++++++ tests/trainer/test_data_collator.py | 19 +++++ 20 files changed, 226 insertions(+) diff --git a/docs/source/en/main_classes/data_collator.md b/docs/source/en/main_classes/data_collator.md index 74e653dd1185e9..e704bb747fe6e0 100644 --- a/docs/source/en/main_classes/data_collator.md +++ b/docs/source/en/main_classes/data_collator.md @@ -66,3 +66,8 @@ Examples of use can be found in the [example scripts](../examples) or [example n - numpy_mask_tokens - tf_mask_tokens - torch_mask_tokens + +## DataCollatorWithFlattening + +[[autodoc]] data.data_collator.DataCollatorWithFlattening + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index bc6e786358b68d..05becb96e0b808 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -103,6 +103,7 @@ "DataCollatorForSOP", "DataCollatorForTokenClassification", "DataCollatorForWholeWordMask", + "DataCollatorWithFlattening", "DataCollatorWithPadding", "DefaultDataCollator", "default_data_collator", @@ -4764,6 +4765,7 @@ DataCollatorForSOP, DataCollatorForTokenClassification, DataCollatorForWholeWordMask, + DataCollatorWithFlattening, DataCollatorWithPadding, DefaultDataCollator, default_data_collator, diff --git a/src/transformers/data/__init__.py b/src/transformers/data/__init__.py index 1a8ef35ff439e4..8b675aae281f32 100644 --- a/src/transformers/data/__init__.py +++ b/src/transformers/data/__init__.py @@ -19,6 +19,7 @@ DataCollatorForSOP, DataCollatorForTokenClassification, DataCollatorForWholeWordMask, + DataCollatorWithFlattening, DataCollatorWithPadding, DefaultDataCollator, default_data_collator, diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index ce17f79ccfc88e..20a21318786c4a 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -1611,3 +1611,38 @@ def numpy_mask_tokens(self, inputs: Any) -> Tuple[Any, Any, Any, Any]: ) & masked_indices[i] return inputs.astype(np.int64), perm_mask, target_mapping, labels.astype(np.int64) + + +@dataclass +class DataCollatorWithFlattening(DefaultDataCollator): + """ + Data collator used for padding free approach. Does the following: + + - concatate the entire mini batch into single long sequence [1, total_tokens] + - no padding will be added, returns `input_ids`, `labels` and `position_ids` + """ + + def __init__(self, *args, return_position_ids=True, **kwargs): + super().__init__(*args, **kwargs) + self.return_position_ids = return_position_ids + warnings.warn( + "Using `DataCollatorWithFlattening` will flatten the entire mini batch into single long sequence." + "Make sure your attention computation is able to handle it!" 
+        )
+
+    def __call__(self, features, return_tensors=None):
+        if return_tensors is None:
+            return_tensors = self.return_tensors
+        is_labels_provided = "labels" in features[0]
+        ret = {"input_ids": [], "labels": []}
+        if self.return_position_ids:
+            ret.update({"position_ids": []})
+        for idx in range(0, len(features)):
+            ret["input_ids"] += features[idx]["input_ids"]
+            if is_labels_provided:
+                ret["labels"] += [-100] + features[idx]["labels"][1:]
+            else:
+                ret["labels"] += [-100] + features[idx]["input_ids"][1:]
+            if self.return_position_ids:
+                ret["position_ids"] += list(range(len(features[idx]["input_ids"])))
+        return default_data_collator([ret], return_tensors)

diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
index 1742e419b4aaea..88dd99e6901d16 100644
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -130,6 +130,56 @@ def _upad_input(
    )

+def prepare_fa2_from_position_ids(query, key, value, position_ids):
+    """
+    This function returns necessary arguments to call `flash_attn_varlen_func`.
+    All three query, key, value states will be flattened.
+    Cumulative lengths of each example in the batch will be extracted from position_ids.
+
+    NOTE: ideally cumulative lengths should be prepared at the data collator stage
+
+    Arguments:
+        query (`torch.Tensor`):
+            Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
+        key (`torch.Tensor`):
+            Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
+        value (`torch.Tensor`):
+            Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
+        position_ids (`torch.Tensor`):
+            Int tensor of shape (batch_size, sequence_length) holding the position index of each token within its sequence; positions restart from 0 at the beginning of every packed sequence.
+
+    Return:
+        query (`torch.Tensor`):
+            Query state without padding. Shape: (total_target_length, num_heads, head_dim).
+        key (`torch.Tensor`):
+            Key state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
+        value (`torch.Tensor`):
+            Value state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
+        indices_q (`torch.Tensor`):
+            The indices of non-masked tokens from the flattened input target sequence.
+        (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`):
+            The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
+        (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`):
+            Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
+ """ + query = query.view(-1, query.size(-2), query.size(-1)) + key = key.view(-1, key.size(-2), key.size(-1)) + value = value.view(-1, value.size(-2), value.size(-1)) + position_ids = position_ids.flatten() + indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32) + + cu_seq_lens = torch.cat( + ( + indices_q[position_ids == 0], + torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32), + ) + ) + + max_length = position_ids.max() + 1 + + return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length)) + + def _flash_attention_forward( query_states: torch.Tensor, key_states: torch.Tensor, @@ -138,6 +188,7 @@ def _flash_attention_forward( query_length: int, is_causal: bool, dropout: float = 0.0, + position_ids: Optional[torch.Tensor] = None, softmax_scale: Optional[float] = None, sliding_window: Optional[int] = None, use_top_left_mask: bool = False, @@ -210,6 +261,34 @@ def _flash_attention_forward( **flash_kwargs, ) attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + + # if position_ids is provided and check not all examples (row) contain only 1 sequence, + # then use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach + elif position_ids is not None and not (position_ids[:, -1] == position_ids.size(1) - 1).all(): + batch_size = query_states.size(0) + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids( + query_states, key_states, value_states, position_ids + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + **flash_kwargs, + ) + + attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1)) + else: attn_output = flash_attn_func( query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal, **flash_kwargs diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 31810028ef4448..e3be8decbc6b52 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -415,6 +415,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 663582c8a72a83..fc7a38ed134d4f 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -602,6 +602,7 @@ def forward( value_layer, attention_mask, query_length, + position_ids=position_ids, dropout=attn_dropout, is_causal=self.is_causal, use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 80e97fe700b5ca..5bc1af3e7ec7a9 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -393,6 +393,7 @@ def forward( value_states, attention_mask, q_len, + 
position_ids=position_ids, dropout=dropout_rate, sliding_window=getattr(self, "sliding_window", None), is_causal=self.is_causal, diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 3115cee78f7677..ce76d1d1ec1b9d 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -503,6 +503,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=getattr(self, "sliding_window", None), use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index dd814cd75fb112..93a60a49dbf34c 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -382,6 +382,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=getattr(self.config, "sliding_window", None), use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index 82320de79386b5..d2ee6e6b268ae0 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -488,6 +488,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=getattr(self.config, "sliding_window", None), is_causal=self.is_causal, diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index a56baf0653ecd3..74d49d5606c145 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -428,6 +428,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal, diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index f80453d3f7d990..1b23be39e5c05d 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -501,6 +501,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=attn_dropout, softmax_scale=None, use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 76e3fbf514f6d6..90b815184b07a8 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -563,6 +563,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=attn_dropout, sliding_window=getattr(self.config, "sliding_window", None), use_top_left_mask=self._flash_attn_uses_top_left_mask, diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 68923ed4052dd8..1ff8896ae5f901 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -429,6 +429,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=sliding_window, is_causal=self.is_causal, diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py 
b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index d88b5c357e86da..54e91da3347dbc 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -508,6 +508,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=sliding_window, is_causal=self.is_causal, diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index ea50a20edea8a8..3a3b6a9e05f117 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -606,6 +606,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, use_top_left_mask=self._flash_attn_uses_top_left_mask, is_causal=self.is_causal, diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index af532b139ca392..f2786f9df48a6b 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -404,6 +404,7 @@ def forward( value_states, attention_mask, q_len, + position_ids=position_ids, dropout=dropout_rate, sliding_window=getattr(self.config, "sliding_window", None), is_causal=self.is_causal, diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 19a945aec52799..abe5ddea2c2511 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -4327,6 +4327,78 @@ def test_flash_attn_2_fp32_ln(self): # with attention mask _ = model(dummy_input, attention_mask=dummy_attention_mask) + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self): + if not self.has_attentions: + self.skipTest(reason="Model architecture does not support attentions") + + max_new_tokens = 30 + + for model_class in self.all_generative_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + assert 0 in inputs_dict["attention_mask"], "assert padding in testing inputs" + # ensure left padding, to adapt for some models + if 0 in inputs_dict["attention_mask"][:, -1]: + inputs_dict["attention_mask"] = inputs_dict["attention_mask"].flip(1) + dummy_attention_mask = inputs_dict["attention_mask"] + inputs_dict["input_ids"][~dummy_attention_mask.bool()] = config.pad_token_id + + model = ( + model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ) + .to(torch_device) + .eval() + ) + + # flatten + padfree_inputs_dict = { + k: v[dummy_attention_mask.bool()].unsqueeze(0) + for k, v in inputs_dict.items() + if not k == "attention_mask" + } + # add position_ids + 
padfree_inputs_dict["position_ids"] = ( + torch.cat([torch.arange(length) for length in dummy_attention_mask.sum(1).tolist()]) + .long() + .unsqueeze(0) + .to(torch_device) + ) + + res_padded = model(**inputs_dict) + res_padfree = model(**padfree_inputs_dict) + + logits_padded = res_padded.logits[inputs_dict["attention_mask"].bool()] + logits_padfree = res_padfree.logits[0] + + torch.testing.assert_close(logits_padded.argmax(-1), logits_padfree.argmax(-1), atol=0, rtol=0) + # acceptable numerical instability + tol = torch.finfo(torch.float16).eps + torch.testing.assert_close(logits_padded, logits_padfree, atol=tol, rtol=tol) + @is_pt_tf_cross_test def test_tf_from_pt_safetensors(self): for model_class in self.all_model_classes: diff --git a/tests/trainer/test_data_collator.py b/tests/trainer/test_data_collator.py index 36e1813258d1a3..8c1f593ff4bcb8 100644 --- a/tests/trainer/test_data_collator.py +++ b/tests/trainer/test_data_collator.py @@ -26,6 +26,7 @@ DataCollatorForSeq2Seq, DataCollatorForTokenClassification, DataCollatorForWholeWordMask, + DataCollatorWithFlattening, DataCollatorWithPadding, default_data_collator, is_tf_available, @@ -1531,6 +1532,24 @@ def test_data_collator_with_padding(self): batch = data_collator(features) self.assertEqual(batch["input_ids"].shape, (2, 8)) + def test_data_collator_with_flattening(self): + features = [ + {"input_ids": [10, 11, 12]}, + {"input_ids": [20, 21, 22, 23, 24, 25]}, + {"input_ids": [30, 31, 32, 33, 34, 35, 36]}, + ] + + data_collator = DataCollatorWithFlattening(return_tensors="np") + batch = data_collator(features) + self.assertEqual(batch["input_ids"].shape, (1, 16)) + self.assertEqual( + batch["input_ids"][0].tolist(), [10, 11, 12, 20, 21, 22, 23, 24, 25, 30, 31, 32, 33, 34, 35, 36] + ) + self.assertNotIn("attention_mask", batch) + self.assertIn("position_ids", batch) + self.assertEqual(batch["position_ids"].shape, (1, 16)) + self.assertEqual(batch["position_ids"][0].tolist(), [0, 1, 2, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 6]) + def test_data_collator_for_token_classification(self): tokenizer = BertTokenizer(self.vocab_file) features = [ From d2c687b3f1859b5c61258af14abba5312c0e6201 Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Tue, 23 Jul 2024 20:37:31 +0530 Subject: [PATCH 27/73] Updated `ruff` to the latest version (#31926) * Updated ruff version and fixed the required code accorindg to the latest version. * Updated ruff version and fixed the required code accorindg to the latest version. 
* Added noqa directive to ignore 1 error shown by ruff --- .../seq2seq-distillation/_test_seq2seq_examples.py | 4 ++-- setup.py | 2 +- src/transformers/dependency_versions_table.py | 2 +- src/transformers/hf_argparser.py | 2 +- src/transformers/modeling_flax_utils.py | 2 +- src/transformers/models/distilbert/modeling_distilbert.py | 2 +- .../models/distilbert/modeling_flax_distilbert.py | 2 +- src/transformers/models/esm/openfold_utils/rigid_utils.py | 4 ++-- .../models/markuplm/feature_extraction_markuplm.py | 2 +- src/transformers/models/musicgen/modeling_musicgen.py | 2 +- src/transformers/trainer_pt_utils.py | 2 +- tests/agents/test_agents.py | 2 +- tests/agents/test_python_interpreter.py | 4 ++-- tests/models/roformer/test_tokenization_roformer.py | 2 +- 14 files changed, 17 insertions(+), 17 deletions(-) diff --git a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py index 454951ed3888a0..0ee4dd8afe1d5e 100644 --- a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py +++ b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py @@ -418,7 +418,7 @@ def test_finetune_lr_schedulers(self): with CaptureStdout() as cs: args = parser.parse_args(args) assert False, "--help is expected to sys.exit" - assert excinfo.type == SystemExit + assert excinfo.type is SystemExit expected = lightning_base.arg_to_scheduler_metavar assert expected in cs.out, "--help is expected to list the supported schedulers" @@ -429,7 +429,7 @@ def test_finetune_lr_schedulers(self): with CaptureStderr() as cs: args = parser.parse_args(args) assert False, "invalid argument is expected to sys.exit" - assert excinfo.type == SystemExit + assert excinfo.type is SystemExit expected = f"invalid choice: '{unsupported_param}'" assert expected in cs.err, f"should have bailed on invalid choice of scheduler {unsupported_param}" diff --git a/setup.py b/setup.py index f6a6875dcda691..67f1cbfd80a26f 100644 --- a/setup.py +++ b/setup.py @@ -157,7 +157,7 @@ "rhoknp>=1.1.0,<1.3.1", "rjieba", "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff==0.4.4", + "ruff==0.5.1", "sacrebleu>=1.4.12,<2.0.0", "sacremoses", "safetensors>=0.4.1", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index fcbb8469b9e5fe..7644d8d68d1696 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -63,7 +63,7 @@ "rhoknp": "rhoknp>=1.1.0,<1.3.1", "rjieba": "rjieba", "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1", - "ruff": "ruff==0.4.4", + "ruff": "ruff==0.5.1", "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", "sacremoses": "sacremoses", "safetensors": "safetensors>=0.4.1", diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index 045bf798050e93..4b5548fffb4154 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -164,7 +164,7 @@ def _parse_dataclass_field(parser: ArgumentParser, field: dataclasses.Field): ) if type(None) not in field.type.__args__: # filter `str` in Union - field.type = field.type.__args__[0] if field.type.__args__[1] == str else field.type.__args__[1] + field.type = field.type.__args__[0] if field.type.__args__[1] is str else field.type.__args__[1] origin_type = getattr(field.type, "__origin__", field.type) elif bool not in field.type.__args__: # filter `NoneType` in Union (except for `Union[bool, NoneType]`) diff --git 
a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index 61077cf7c30938..9d12e1e67c8082 100644 --- a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -90,7 +90,7 @@ def dtype_byte_size(dtype): 4 ``` """ - if dtype == bool: + if dtype is bool: return 1 / 8 bit_search = re.search(r"[^\d](\d+)$", dtype.name) if bit_search is None: diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index 4d9173fd08bfd5..e80e3c41d22cb6 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -398,7 +398,7 @@ def forward( if output_attentions: sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) else: # To handle these `output_attentions` or `output_hidden_states` cases returning tuples - if type(sa_output) != tuple: + if type(sa_output) is not tuple: raise TypeError(f"sa_output must be a tuple but it is {type(sa_output)} type") sa_output = sa_output[0] diff --git a/src/transformers/models/distilbert/modeling_flax_distilbert.py b/src/transformers/models/distilbert/modeling_flax_distilbert.py index d3c48c077adc52..0cb7cdb033c148 100644 --- a/src/transformers/models/distilbert/modeling_flax_distilbert.py +++ b/src/transformers/models/distilbert/modeling_flax_distilbert.py @@ -304,7 +304,7 @@ def __call__( if output_attentions: sa_output, sa_weights = sa_output else: - assert type(sa_output) == tuple + assert type(sa_output) is tuple sa_output = sa_output[0] sa_output = self.sa_layer_norm(sa_output + hidden_states) diff --git a/src/transformers/models/esm/openfold_utils/rigid_utils.py b/src/transformers/models/esm/openfold_utils/rigid_utils.py index 2bc2fe5f5c4ebf..08f5ce0a4f7e2c 100644 --- a/src/transformers/models/esm/openfold_utils/rigid_utils.py +++ b/src/transformers/models/esm/openfold_utils/rigid_utils.py @@ -343,7 +343,7 @@ def __getitem__(self, index: Any) -> Rotation: Returns: The indexed rotation """ - if type(index) != tuple: + if type(index) is not tuple: index = (index,) if self._rot_mats is not None: @@ -827,7 +827,7 @@ def __getitem__(self, index: Any) -> Rigid: Returns: The indexed tensor """ - if type(index) != tuple: + if type(index) is not tuple: index = (index,) return Rigid( diff --git a/src/transformers/models/markuplm/feature_extraction_markuplm.py b/src/transformers/models/markuplm/feature_extraction_markuplm.py index 73c16bad302b54..e3effdc910a8c7 100644 --- a/src/transformers/models/markuplm/feature_extraction_markuplm.py +++ b/src/transformers/models/markuplm/feature_extraction_markuplm.py @@ -68,7 +68,7 @@ def get_three_from_single(self, html_string): for element in html_code.descendants: if isinstance(element, bs4.element.NavigableString): - if type(element.parent) != bs4.element.Tag: + if type(element.parent) is not bs4.element.Tag: continue text_in_this_tag = html.unescape(element).strip() diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 7aaaeb461c1343..b0e456db8add38 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -2550,7 +2550,7 @@ def generate( generation_config.validate() self._validate_model_kwargs(model_kwargs.copy()) - if model_kwargs.get("encoder_outputs") is not None and type(model_kwargs["encoder_outputs"]) == tuple: + if 
model_kwargs.get("encoder_outputs") is not None and type(model_kwargs["encoder_outputs"]) is tuple: # wrap the unconditional outputs as a BaseModelOutput for compatibility with the rest of generate model_kwargs["encoder_outputs"] = BaseModelOutput(last_hidden_state=model_kwargs["encoder_outputs"][0]) diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index 5c1ffd85163674..69b547dec572fe 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -254,7 +254,7 @@ def reissue_pt_warnings(caught_warnings): # Reissue warnings that are not the SAVE_STATE_WARNING if len(caught_warnings) > 1: for w in caught_warnings: - if w.category != UserWarning or w.message != SAVE_STATE_WARNING: + if w.category is not UserWarning or w.message != SAVE_STATE_WARNING: warnings.warn(w.message, w.category) diff --git a/tests/agents/test_agents.py b/tests/agents/test_agents.py index f47d0b0c35c3e0..6dac8b85201528 100644 --- a/tests/agents/test_agents.py +++ b/tests/agents/test_agents.py @@ -198,7 +198,7 @@ def test_react_fails_max_iterations(self): ) agent.run("What is 2 multiplied by 3.6452?") assert len(agent.logs) == 7 - assert type(agent.logs[-1]["error"]) == AgentMaxIterationsError + assert type(agent.logs[-1]["error"]) is AgentMaxIterationsError @require_torch def test_init_agent_with_different_toolsets(self): diff --git a/tests/agents/test_python_interpreter.py b/tests/agents/test_python_interpreter.py index 8614302baae764..feb923af28b4ed 100644 --- a/tests/agents/test_python_interpreter.py +++ b/tests/agents/test_python_interpreter.py @@ -214,7 +214,7 @@ def test_evaluate_slicing(self): def test_access_attributes(self): code = "integer = 1\nobj_class = integer.__class__\nobj_class" result = evaluate_python_code(code, {}, state={}) - assert result == int + assert result is int def test_list_comprehension(self): code = "sentence = 'THESEAGULL43'\nmeaningful_sentence = '-'.join([char.lower() for char in sentence if char.isalpha()])" @@ -591,7 +591,7 @@ def test_types_as_objects(self): code = "type_a = float(2); type_b = str; type_c = int" state = {} result = evaluate_python_code(code, {"float": float, "str": str, "int": int}, state=state) - assert result == int + assert result is int def test_tuple_id(self): code = """ diff --git a/tests/models/roformer/test_tokenization_roformer.py b/tests/models/roformer/test_tokenization_roformer.py index 2c5b9c65e96793..6dfd0a385f0d87 100644 --- a/tests/models/roformer/test_tokenization_roformer.py +++ b/tests/models/roformer/test_tokenization_roformer.py @@ -56,7 +56,7 @@ def test_tokenizer(self): exp_tokens = [22943, 21332, 34431, 45904, 117, 306, 1231, 1231, 2653, 33994, 1266, 100] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), exp_tokens) - def test_rust_tokenizer(self): + def test_rust_tokenizer(self): # noqa: F811 tokenizer = self.get_rust_tokenizer() input_text, output_text = self.get_chinese_input_output_texts() tokens = tokenizer.tokenize(input_text) From ff0d708fe627d6715f9a3e97d0a7947f70437447 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 23 Jul 2024 17:12:47 +0200 Subject: [PATCH 28/73] Dev version: v4.44.0.dev0 --- examples/flax/question-answering/run_qa.py | 2 +- .../speech-recognition/run_flax_speech_recognition_seq2seq.py | 2 +- examples/flax/text-classification/run_flax_glue.py | 2 +- examples/flax/token-classification/run_flax_ner.py | 2 +- .../pytorch/audio-classification/run_audio_classification.py | 2 +- 
examples/pytorch/contrastive-image-text/run_clip.py | 2 +- .../pytorch/image-classification/run_image_classification.py | 2 +- .../image-classification/run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- examples/pytorch/image-pretraining/run_mim_no_trainer.py | 2 +- .../pytorch/instance-segmentation/run_instance_segmentation.py | 2 +- .../run_instance_segmentation_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_fim.py | 2 +- examples/pytorch/language-modeling/run_fim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 2 +- examples/pytorch/object-detection/run_object_detection.py | 2 +- .../pytorch/object-detection/run_object_detection_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- examples/pytorch/question-answering/run_qa_beam_search.py | 2 +- .../pytorch/question-answering/run_qa_beam_search_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa_no_trainer.py | 2 +- examples/pytorch/question-answering/run_seq2seq_qa.py | 2 +- .../pytorch/semantic-segmentation/run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../pytorch/speech-recognition/run_speech_recognition_ctc.py | 2 +- .../speech-recognition/run_speech_recognition_ctc_adapter.py | 2 +- .../speech-recognition/run_speech_recognition_seq2seq.py | 2 +- examples/pytorch/summarization/run_summarization.py | 2 +- examples/pytorch/summarization/run_summarization_no_trainer.py | 2 +- examples/pytorch/text-classification/run_classification.py | 2 +- examples/pytorch/text-classification/run_glue.py | 2 +- examples/pytorch/text-classification/run_glue_no_trainer.py | 2 +- examples/pytorch/text-classification/run_xnli.py | 2 +- examples/pytorch/token-classification/run_ner.py | 2 +- examples/pytorch/token-classification/run_ner_no_trainer.py | 2 +- examples/pytorch/translation/run_translation.py | 2 +- examples/pytorch/translation/run_translation_no_trainer.py | 2 +- examples/tensorflow/contrastive-image-text/run_clip.py | 2 +- .../tensorflow/image-classification/run_image_classification.py | 2 +- examples/tensorflow/multiple-choice/run_swag.py | 2 +- examples/tensorflow/question-answering/run_qa.py | 2 +- examples/tensorflow/summarization/run_summarization.py | 2 +- examples/tensorflow/text-classification/run_glue.py | 2 +- examples/tensorflow/translation/run_translation.py | 2 +- setup.py | 2 +- src/transformers/__init__.py | 2 +- 53 files changed, 53 insertions(+), 53 deletions(-) diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 1819b0235fafc9..cb708534116eee 100644 --- a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index 501aaf18642400..cea636c9a782f2 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 281881c51182ee..5bb8a245bbd139 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 6eb162adcb0784..88f4f2f635d042 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 3c75f0b1504d19..ad9fde080373c1 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index c4936410c52ade..60f698951d0baa 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index b7557b903fdf06..3f8a2433c94f68 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index e67424f6819ca9..ee30bd4608638c 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 8af6e18b1ca37c..a1624d12c01e73 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index c2c3ff818b5b6b..379ad7eaecae35 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index e3efbec76c4419..fe1593fbdd849f 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 9a29e43d7d304c..e7d752a8b4513f 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index 8f57997deacbc7..e926b8b73ba5cd 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index cf80ae83ab2e0a..d6f04ed75ee4bc 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 7ef8d94f3e3cfb..577d86e4fc1579 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index 7154f1ffcd71e5..dd00375dfe849f 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 11c64c7c4849ef..6cb4a3b6bca5e7 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index f40015d9701dc1..ada6a4534b67df 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 75c3b1936fd024..8fe6697c497d94 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 33dc8baaa6e93f..3a9b3ffa16ac8d 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 51c13458ed44c0..7cc2a772d7eddf 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 8356493762ed7c..697ec34ac8860b 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 3f1eb681df225a..8250320dfbdad5 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index 296f045a5234e3..34d04d99d53c39 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 83eda3e98a75ea..53393ddc5d8ab4 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 4ba3564d6e505e..bfdf1fc70a1967 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index db6256aedbacfd..909084e65298a9 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 202cc7d661db87..6aed16d9fba88b 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index 1932df9677ce6c..bac49185d55b43 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index a5929205b2f5fe..ad92a3c99e6be1 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 5ff906c22cba16..5c78da9119c078 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index b4019d64f774ce..ffcca59f3ddb3f 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 3da281430ec272..77bd768cc4f389 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index 501b2df1c5eb6a..b18bfb15ea1793 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 225b20fcd63572..9bc80cc5f27e26 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 325cbebd9634b0..3f35c72113cf05 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index b4520d6af340eb..d37ebb7c4bbf3d 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index e3bb7ea94ca07f..751084a6f6e3dc 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 5ec90faf87a57d..152b8b02999db9 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index 7fa0d3bc9005bf..18f5d37821c186 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 937d09d74d6277..65a34df2945d1f 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 61e14da6cb8b25..085dc7879d9b69 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index 7ccfc30802f5f0..f4194d4e30b834 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index e2e9f3d3d3daa0..7e7ce04f02413e 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index ba83bbe56dcfb3..9cd6dc08bff5e4 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index a3ea7cf0b10d99..025ab7c448b8a3 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 7d8189b087efd8..d6fe9ab7cb5007 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index d9758c401826b7..1c3de1b137f315 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 3f5190186dc3f5..b5f6f444057a32 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index ec3fbe8c61bad3..c098192477a3bf 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 7280f7a95b37a5..6fe65b0cf3c669 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.43.0.dev0") +check_min_version("4.44.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index 67f1cbfd80a26f..5815d69e4da7cb 100644 --- a/setup.py +++ b/setup.py @@ -430,7 +430,7 @@ def run(self): setup( name="transformers", - version="4.43.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.44.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 05becb96e0b808..9108367f35b321 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.43.0.dev0" +__version__ = "4.44.0.dev0" from typing import TYPE_CHECKING From d5a99dfcee6e94065cb7c83cc8ab6fc5daa0cc4e Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 23 Jul 2024 16:58:17 +0200 Subject: [PATCH 29/73] Llama 3.1 conversion Co-authored-by: Arthur Zucker --- src/transformers/modeling_rope_utils.py | 103 +++++++++++- .../models/llama/configuration_llama.py | 31 ++-- .../llama/convert_llama_weights_to_hf.py | 153 ++++++++++++------ 3 files changed, 228 insertions(+), 59 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 33055d2bf942e9..14a12b9394059d 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -129,6 +129,7 @@ def _compute_dynamic_ntk_parameters( Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" + # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling if config is not None and len(rope_kwargs) > 0: raise ValueError( "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " @@ -249,6 +250,7 @@ def _compute_longrope_parameters( Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin. """ + # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling # No need to keep BC with longrope, unreleased when this new pattern was created. if len(rope_kwargs) > 0: raise ValueError( @@ -293,6 +295,50 @@ def _compute_longrope_parameters( return inv_freq, attention_factor +def _compute_llama3_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> Tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies for llama 3.1. + + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # Gets the default RoPE parameters + inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) + + factor = config.rope_scaling["factor"] # `8` in the original implementation + low_freq_factor = config.rope_scaling["low_freq_factor"] # `1` in the original implementation + high_freq_factor = config.rope_scaling["high_freq_factor"] # `4` in the original implementation + old_context_len = config.rope_scaling["original_max_position_embeddings"] # `8192` in the original implementation + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + new_freqs = [] + for freq in inv_freq: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + new_freqs.append((1 - smooth) * freq / factor + smooth * freq) + inv_freq = torch.tensor(new_freqs, dtype=inv_freq.dtype, device=inv_freq.device) + return inv_freq, attention_factor + + # This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters # from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE # parameterizations, as long as the callable has the same signature. 
@@ -302,6 +348,7 @@ def _compute_longrope_parameters( "dynamic": _compute_dynamic_ntk_parameters, "yarn": _compute_yarn_parameters, "longrope": _compute_longrope_parameters, + "llama3": _compute_llama3_parameters, } @@ -339,6 +386,20 @@ def _validate_linear_scaling_rope_parameters(config: PretrainedConfig): raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") +def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "factor"} + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"original_max_position_embeddings"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + def _validate_yarn_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling rope_type = rope_scaling["rope_type"] @@ -374,7 +435,8 @@ def _validate_longrope_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling rope_type = rope_scaling["rope_type"] required_keys = {"rope_type", "short_factor", "long_factor"} - optional_keys = {"attention_factor", "factor"} + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} received_keys = set(rope_scaling.keys()) _check_received_keys(rope_type, received_keys, required_keys, optional_keys) @@ -417,13 +479,50 @@ def _validate_longrope_parameters(config: PretrainedConfig): ) +def _validate_llama3_parameters(config: PretrainedConfig): + rope_scaling = config.rope_scaling + rope_type = rope_scaling["rope_type"] + required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + low_freq_factor = rope_scaling["low_freq_factor"] + high_freq_factor = rope_scaling["high_freq_factor"] + if low_freq_factor is None or not isinstance(low_freq_factor, float): + raise ValueError(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}") + if high_freq_factor is None or not isinstance(high_freq_factor, float): + raise ValueError(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}") + if high_freq_factor < low_freq_factor: + raise ValueError( + "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" + f"{high_freq_factor} and low_freq_factor={low_freq_factor}" + ) + + original_max_position_embeddings = rope_scaling["original_max_position_embeddings"] + if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int): + raise ValueError( + "`rope_scaling`'s original_max_position_embeddings field must be an integer, got " + f"{original_max_position_embeddings}" + ) + if original_max_position_embeddings >= config.max_position_embeddings: + raise ValueError( + "`rope_scaling`'s original_max_position_embeddings field 
must be less than max_position_embeddings, got " + f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}" + ) + + # Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types. ROPE_VALIDATION_FUNCTIONS = { "default": _validate_default_rope_parameters, "linear": _validate_linear_scaling_rope_parameters, - "dynamic": _validate_linear_scaling_rope_parameters, # `dynamic` has the same validation pattern as `linear` + "dynamic": _validate_dynamic_scaling_rope_parameters, "yarn": _validate_yarn_parameters, "longrope": _validate_longrope_parameters, + "llama3": _validate_llama3_parameters, } diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 7c987ec85a0409..c632a870be7a18 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -73,25 +73,28 @@ class LlamaConfig(PretrainedConfig): End of stream token id. pretraining_tp (`int`, *optional*, defaults to 1): Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this - document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to understand more about it. This value is - necessary to ensure exact reproducibility of the pretraining results. Please refer to [this - issue](https://github.com/pytorch/pytorch/issues/76232). + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. IMPORTANT: RoPE scaling expects - `max_position_embeddings` to remain unchanged -- some methods, like 'longrope', require the original value - to determine which scaling to apply. + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. Expected contents: `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope'], - with 'default' being the original RoPE implementation. + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. `factor` (`float`, *optional*): Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In most scaling types, a `factor` of x will enable the model to handle sequences of length x * - `max_position_embeddings`. + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. `attention_factor` (`float`, *optional*): Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention computation. 
If unspecified, it defaults to value recommended by the implementation, using the @@ -104,12 +107,16 @@ class LlamaConfig(PretrainedConfig): ramp function. If unspecified, it defaults to 1. `short_factor` (`List[float]`, *optional*): Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `max_position_embeddings` * `factor`). Must be a list of numbers with the same length as the hidden + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `max_position_embeddings` * `factor`). Must be a list of numbers with the same length as the hidden + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py index fd6ab4f2e926de..384daab6b6d7a5 100644 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py @@ -17,10 +17,11 @@ import os import shutil import warnings +from typing import List import torch -from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast +from transformers import GenerationConfig, LlamaConfig, LlamaForCausalLM, LlamaTokenizer, PreTrainedTokenizerFast from transformers.convert_slow_tokenizer import TikTokenConverter @@ -85,8 +86,12 @@ "65B": 8, "70B": 8, "70Bf": 8, + "405B": 8, + "405B-MP16": 16, } +CONTEXT_LENGTH_FOR_VERSION = {"3.1": 131072, "3": 8192, "2": 4096, "1": 2048} + def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) @@ -107,9 +112,10 @@ def write_model( input_base_path, model_size=None, safe_serialization=True, - llama_version=1, + llama_version="1", vocab_size=None, num_shards=None, + instruct=False, ): os.makedirs(model_path, exist_ok=True) tmp_model_path = os.path.join(model_path, "tmp") @@ -125,18 +131,11 @@ def write_model( dims_per_head = dim // n_heads base = params.get("rope_theta", 10000.0) inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) - if base > 10000.0 and llama_version != 3: + if base > 10000.0 and float(llama_version) < 3: max_position_embeddings = 16384 else: - # Depending on the Llama version, the default max_position_embeddings has different values. 
- if llama_version == 1: - max_position_embeddings = 2048 - elif llama_version == 2: - max_position_embeddings = 4096 - elif llama_version == 3: - max_position_embeddings = 8192 - - vocab_size = vocab_size if vocab_size is not None else 32000 + max_position_embeddings = CONTEXT_LENGTH_FOR_VERSION[llama_version] + if params.get("n_kv_heads", None) is not None: num_key_value_heads = params["n_kv_heads"] # for GQA / MQA num_key_value_heads_per_shard = num_key_value_heads // num_shards @@ -144,8 +143,7 @@ def write_model( else: # compatibility with other checkpoints num_key_value_heads = n_heads num_key_value_heads_per_shard = n_heads_per_shard - key_value_dim = dims_per_head * num_key_value_heads - print(num_shards, num_key_value_heads, num_key_value_heads_per_shard, key_value_dim) + key_value_dim = dim # permute for sliced rotary def permute(w, n_heads, dim1=dim, dim2=dim): @@ -159,11 +157,9 @@ def permute(w, n_heads, dim1=dim, dim2=dim): loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu") else: # Sharded - loaded = [ - torch.load(os.path.join(input_base_path, file), map_location="cpu") - for file in sorted(os.listdir(input_base_path)) - if file.endswith(".pth") - ] + checkpoint_list = sorted([file for file in os.listdir(input_base_path) if file.endswith(".pth")]) + print("Loading in order:", checkpoint_list) + loaded = [torch.load(os.path.join(input_base_path, file), map_location="cpu") for file in checkpoint_list] param_count = 0 index_dict = {"weight_map": {}} for layer_i in range(n_layers): @@ -263,7 +259,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): "lm_head.weight": loaded["output.weight"], } else: - concat_dim = 0 if llama_version == 3 else 1 + concat_dim = 0 if llama_version in ["3", "3.1"] else 1 state_dict = { "model.norm.weight": loaded[0]["norm.weight"], "model.embed_tokens.weight": torch.cat( @@ -282,6 +278,18 @@ def permute(w, n_heads, dim1=dim, dim2=dim): write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json")) ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 multiple_of = params["multiple_of"] if "multiple_of" in params else 256 + + if llama_version in ["3", "3.1"]: + bos_token_id = 128000 + + if instruct: + eos_token_id = [128001, 128008, 128009] + else: + eos_token_id = 128001 + else: + bos_token_id = 1 + eos_token_id = 2 + config = LlamaConfig( hidden_size=dim, intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), @@ -292,11 +300,21 @@ def permute(w, n_heads, dim1=dim, dim2=dim): vocab_size=vocab_size, rope_theta=base, max_position_embeddings=max_position_embeddings, - bos_token_id=128000 if llama_version == 3 else 1, - eos_token_id=128001 if llama_version == 3 else 2, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, ) config.save_pretrained(tmp_model_path) + if instruct: + generation_config = GenerationConfig( + do_sample=True, + temperature=0.6, + top_p=0.9, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + ) + generation_config.save_pretrained(tmp_model_path) + # Make space so we can load the model properly now. 
del state_dict del loaded @@ -313,7 +331,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): class Llama3Converter(TikTokenConverter): - def __init__(self, vocab_file, num_reserved_special_tokens=256, **kwargs): + def __init__(self, vocab_file, special_tokens=None, instruct=False, model_max_length=None, **kwargs): super().__init__(vocab_file, **kwargs) tokenizer = self.converted() chat_template = ( @@ -327,34 +345,24 @@ def __init__(self, vocab_file, num_reserved_special_tokens=256, **kwargs): "{% endfor %}" "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}" ) - num_reserved_special_tokens = 256 - special_tokens = [ - "<|begin_of_text|>", - "<|end_of_text|>", - "<|reserved_special_token_0|>", - "<|reserved_special_token_1|>", - "<|reserved_special_token_2|>", - "<|reserved_special_token_3|>", - "<|start_header_id|>", - "<|end_header_id|>", - "<|reserved_special_token_4|>", - "<|eot_id|>", # end of turn - ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)] tokenizer.add_special_tokens(special_tokens) self.tokenizer = PreTrainedTokenizerFast( tokenizer_object=tokenizer, bos_token="<|begin_of_text|>", - eos_token="<|end_of_text|>", - chat_template=chat_template, + eos_token="<|end_of_text|>" if not instruct else "<|eot_id|>", + chat_template=chat_template if instruct else None, model_input_names=["input_ids", "attention_mask"], + model_max_length=model_max_length, ) -def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version=2): +def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version="2", special_tokens=None, instruct=False): tokenizer_class = LlamaTokenizer if LlamaTokenizerFast is None else LlamaTokenizerFast - if llama_version == 3: - tokenizer = Llama3Converter(input_tokenizer_path).tokenizer + if llama_version in ["3", "3.1"]: + tokenizer = Llama3Converter( + input_tokenizer_path, special_tokens, instruct, model_max_length=CONTEXT_LENGTH_FOR_VERSION[llama_version] + ).tokenizer else: tokenizer = tokenizer_class(input_tokenizer_path) print(f"Saving a {tokenizer_class.__name__} to {tokenizer_path}.") @@ -362,6 +370,37 @@ def write_tokenizer(tokenizer_path, input_tokenizer_path, llama_version=2): return tokenizer +DEFAULT_LLAMA_SPECIAL_TOKENS = { + "3": [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|reserved_special_token_2|>", + "<|reserved_special_token_3|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|reserved_special_token_4|>", + "<|eot_id|>", # end of turn + ] + + [f"<|reserved_special_token_{i}|>" for i in range(5, 256 - 5)], + "3.1": [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|finetune_right_pad_id|>", + "<|reserved_special_token_2|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|eom_id|>", # end of message + "<|eot_id|>", # end of turn + "<|python_tag|>", + ] + + [f"<|reserved_special_token_{i}|>" for i in range(3, 256 - 8)], +} + + def main(): parser = argparse.ArgumentParser() parser.add_argument( @@ -383,9 +422,9 @@ def main(): # Different Llama versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. parser.add_argument( "--llama_version", - choices=[1, 2, 3], - default=1, - type=int, + choices=["1", "2", "3", "3.1"], + default="1", + type=str, help="Version of the Llama model to convert. Currently supports Llama1 and Llama2. 
Controls the context size", ) parser.add_argument( @@ -394,11 +433,34 @@ def main(): type=int, help="The number of individual shards used for the model. Does not have to be the same as the number of consolidated_xx.pth", ) + parser.add_argument( + "--special_tokens", + default=None, + type=List[str], + help="The list of special tokens that should be added to the model.", + ) + parser.add_argument( + "--instruct", + default=False, + type=bool, + help="Whether the model is an instruct model or not. Will affect special tokens for llama 3.1.", + ) args = parser.parse_args() if args.model_size is None and args.num_shards is None: raise ValueError("You have to set at least `num_shards` if you are not giving the `model_size`") + if args.special_tokens is None: + args.special_tokens = DEFAULT_LLAMA_SPECIAL_TOKENS[str(args.llama_version)] + spm_path = os.path.join(args.input_dir, "tokenizer.model") - vocab_size = len(write_tokenizer(args.output_dir, spm_path, llama_version=args.llama_version)) + vocab_size = len( + write_tokenizer( + args.output_dir, + spm_path, + llama_version=args.llama_version, + special_tokens=args.special_tokens, + instruct=args.instruct, + ) + ) if args.model_size != "tokenizer_only": write_model( model_path=args.output_dir, @@ -408,6 +470,7 @@ def main(): llama_version=args.llama_version, vocab_size=vocab_size, num_shards=args.num_shards, + instruct=args.instruct, ) From 23f6a43f82fb2980f4b30cf3f95eb3a940384895 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 23 Jul 2024 16:48:16 +0100 Subject: [PATCH 30/73] fix (#32162) --- src/transformers/models/llama/modeling_llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index ce76d1d1ec1b9d..adb455acfbbc24 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -107,7 +107,7 @@ def __init__( else: # BC: "rope_type" was originally "type" if config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling["type"]) + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) else: self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings From bc2adb0112b6677b0dfb4105c74570a0f92183eb Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Tue, 23 Jul 2024 21:22:41 +0530 Subject: [PATCH 31/73] fix: Fixed an if condition that is always evaluating to true (#32160) Fixed an if condition always evaluating to true. 
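The condition fixed here is the usual Python truthiness pitfall: `if "nyu" or "midas" in checkpoint_url` parses as `("nyu") or ("midas" in checkpoint_url)`, and the non-empty literal `"nyu"` makes the whole test pass for every URL. A minimal illustration (the URL below is hypothetical and contains neither substring):

checkpoint_url = "https://example.com/dpt_large-kitti.pt"  # hypothetical URL

# Buggy form: "nyu" is a non-empty string, so the expression is always truthy.
print(bool("nyu" or "midas" in checkpoint_url))  # True

# Fixed form, as in the patch: each membership test is spelled out explicitly.
print("nyu" in checkpoint_url or "midas" in checkpoint_url)  # False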
--- src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py index a407a67f3813ed..16e4d71212b53a 100644 --- a/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py +++ b/src/transformers/models/dpt/convert_dpt_hybrid_to_pytorch.py @@ -43,7 +43,7 @@ def get_dpt_config(checkpoint_url): config.neck_hidden_sizes = [256, 512, 1024, 1024] expected_shape = (1, 384, 384) - if "nyu" or "midas" in checkpoint_url: + if "nyu" in checkpoint_url or "midas" in checkpoint_url: config.hidden_size = 768 config.reassemble_factors = [1, 1, 1, 0.5] config.neck_hidden_sizes = [256, 512, 768, 768] From c85510f958e6955d88ea1bafb4f320074bfbd0c1 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Wed, 24 Jul 2024 00:47:51 +0800 Subject: [PATCH 32/73] [docs] change temperature to a positive value (#32077) fix --- docs/source/en/conversations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/conversations.md b/docs/source/en/conversations.md index 9336503ad7cb8c..a48c046b4949d7 100644 --- a/docs/source/en/conversations.md +++ b/docs/source/en/conversations.md @@ -195,7 +195,7 @@ inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()} print("Tokenized inputs:\n", inputs) # 4: Generate text from the model -outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.) +outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1) print("Generated tokens:\n", outputs) # 5: Decode the output back to a string From 01be5b48790f113b7d71943b580c842e3e097988 Mon Sep 17 00:00:00 2001 From: Rohit Dwivedula <25080952+rohitdwivedula@users.noreply.github.com> Date: Wed, 24 Jul 2024 02:09:59 -0500 Subject: [PATCH 33/73] adds: extra_repr() to MambaRMSNorm to include hidden size / size of weights in the layer (#32171) * adds: extra_repr() to MambaRMSNorm to include the hidden size of the layer * style fix with ruff: --- src/transformers/models/mamba/modeling_mamba.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index 50c0f9ebe4a580..fb519bee3da03f 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -327,6 +327,9 @@ def forward(self, hidden_states): hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) return self.weight * hidden_states.to(input_dtype) + def extra_repr(self): + return f"{self.weight.shape[0]}, eps={self.variance_epsilon}" + class MambaBlock(nn.Module): def __init__(self, config, layer_idx): From 8678879f1dc2578cec18232146bf19de97aecaa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EC=A1=B0=EC=A4=80=EB=9E=98?= Date: Wed, 24 Jul 2024 19:38:49 +0900 Subject: [PATCH 34/73] fix: default value reflects the runtime environment variables rather than the ones present at import time. (#32153) * fix: default value reflects the runtime environment variables rather than the ones present at import time. 
* Fix: Change `deterministic` to None by default; use env var if None --- src/transformers/modeling_flash_attention_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py index 88dd99e6901d16..4f0ff8817b7e36 100644 --- a/src/transformers/modeling_flash_attention_utils.py +++ b/src/transformers/modeling_flash_attention_utils.py @@ -193,7 +193,7 @@ def _flash_attention_forward( sliding_window: Optional[int] = None, use_top_left_mask: bool = False, softcap: Optional[float] = None, - deterministic: bool = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1", + deterministic: bool = None, ): """ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token @@ -233,6 +233,8 @@ def _flash_attention_forward( flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {} if is_flash_attn_greater_or_equal("2.4.1"): + if deterministic is None: + deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" flash_kwargs["deterministic"] = deterministic if softcap is not None: From 5f4ee98a7ade33e1c54fdd6181d04ee7b426b392 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dr=2E=20Artificial=E6=9B=BE=E5=B0=8F=E5=81=A5?= <875100501@qq.com> Date: Wed, 24 Jul 2024 18:54:41 +0800 Subject: [PATCH 35/73] Update qwen2.md (#32108) * Update qwen2.md outdated description * Update qwen2.md amended * Update qwen2.md Update * Update qwen2.md fix wrong version code, now good to go --- docs/source/en/model_doc/qwen2.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index ac0e25e02c35f9..16815f2fc1f3cd 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. ## Overview -Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen-72B, Qwen-1.8B, Qwen-VL, Qwen-Audio, etc. +Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen2-0.5B, Qwen2-1.5B, Qwen2-7B, Qwen2-57B-A14B, Qwen2-72B, Qwen2-Audio, etc. ### Model Details @@ -27,16 +27,16 @@ Qwen2 is a language model series including decoder language models of different ## Usage tips -`Qwen2-7B-beta` and `Qwen2-7B-Chat-beta` can be found on the [Huggingface Hub](https://huggingface.co/Qwen) +`Qwen2-7B` and `Qwen2-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen) -In the following, we demonstrate how to use `Qwen2-7B-Chat-beta` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. +In the following, we demonstrate how to use `Qwen2-7B-Instruct` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose. 
```python >>> from transformers import AutoModelForCausalLM, AutoTokenizer >>> device = "cuda" # the device to load the model onto ->>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-7B-Chat", device_map="auto") ->>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat") +>>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct", device_map="auto") +>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct") >>> prompt = "Give me a short introduction to large language model." From 165116bc145dcc186fa287e624b28a9ab3a79955 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Wed, 24 Jul 2024 14:03:40 +0100 Subject: [PATCH 36/73] Remove conversational pipeline tests (#32099) Remove conversation pipeline tests --- .../test_pipelines_conversational.py | 439 ------------------ 1 file changed, 439 deletions(-) delete mode 100644 tests/pipelines/test_pipelines_conversational.py diff --git a/tests/pipelines/test_pipelines_conversational.py b/tests/pipelines/test_pipelines_conversational.py deleted file mode 100644 index 5b6eb514b1a954..00000000000000 --- a/tests/pipelines/test_pipelines_conversational.py +++ /dev/null @@ -1,439 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import gc -import unittest - -from transformers import ( - MODEL_FOR_CAUSAL_LM_MAPPING, - MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - TF_MODEL_FOR_CAUSAL_LM_MAPPING, - TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, - AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoTokenizer, - BlenderbotSmallForConditionalGeneration, - BlenderbotSmallTokenizer, - Conversation, - ConversationalPipeline, - TFAutoModelForCausalLM, - pipeline, -) -from transformers.testing_utils import ( - backend_empty_cache, - is_pipeline_test, - is_torch_available, - require_tf, - require_torch, - slow, - torch_device, -) - -from .test_pipelines_common import ANY - - -@is_pipeline_test -class ConversationalPipelineTests(unittest.TestCase): - def tearDown(self): - super().tearDown() - # clean-up as much as possible GPU memory occupied by PyTorch - gc.collect() - if is_torch_available(): - backend_empty_cache(torch_device) - - model_mapping = dict( - list(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items()) - if MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - else [] + list(MODEL_FOR_CAUSAL_LM_MAPPING.items()) - if MODEL_FOR_CAUSAL_LM_MAPPING - else [] - ) - tf_model_mapping = dict( - list(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items()) - if TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING - else [] + list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.items()) - if TF_MODEL_FOR_CAUSAL_LM_MAPPING - else [] - ) - - def get_test_pipeline(self, model, tokenizer, processor): - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - return conversation_agent, [Conversation("Hi there!")] - - def run_pipeline_test(self, conversation_agent, _): - # Simple - outputs = conversation_agent(Conversation("Hi there!"), max_new_tokens=5) - self.assertEqual( - outputs, - Conversation([{"role": "user", "content": "Hi there!"}, {"role": "assistant", "content": ANY(str)}]), - ) - - # Single list - outputs = conversation_agent([Conversation("Hi there!")], max_new_tokens=5) - self.assertEqual( - outputs, - Conversation([{"role": "user", "content": "Hi there!"}, {"role": "assistant", "content": ANY(str)}]), - ) - - # Batch - conversation_1 = Conversation("Going to the movies tonight - any suggestions?") - conversation_2 = Conversation("What's the last book you have read?") - self.assertEqual(len(conversation_1), 1) - self.assertEqual(len(conversation_2), 1) - - outputs = conversation_agent([conversation_1, conversation_2], max_new_tokens=5) - self.assertEqual(outputs, [conversation_1, conversation_2]) - self.assertEqual( - outputs, - [ - Conversation( - [ - {"role": "user", "content": "Going to the movies tonight - any suggestions?"}, - {"role": "assistant", "content": ANY(str)}, - ], - ), - Conversation( - [ - {"role": "user", "content": "What's the last book you have read?"}, - {"role": "assistant", "content": ANY(str)}, - ] - ), - ], - ) - - # One conversation with history - conversation_2.add_message({"role": "user", "content": "Why do you recommend it?"}) - outputs = conversation_agent(conversation_2, max_new_tokens=5) - self.assertEqual(outputs, conversation_2) - self.assertEqual( - outputs, - Conversation( - [ - {"role": "user", "content": "What's the last book you have read?"}, - {"role": "assistant", "content": ANY(str)}, - {"role": "user", "content": "Why do you recommend it?"}, - {"role": "assistant", "content": ANY(str)}, - ] - ), - ) - - @require_torch - @slow - def test_integration_torch_conversation(self): - # When - conversation_agent = pipeline(task="conversational", device=torch_device) - conversation_1 = Conversation("Going to the 
movies tonight - any suggestions?") - conversation_2 = Conversation("What's the last book you have read?") - # Then - self.assertEqual(len(conversation_1.past_user_inputs), 0) - self.assertEqual(len(conversation_2.past_user_inputs), 0) - # When - result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000) - # Then - self.assertEqual(result, [conversation_1, conversation_2]) - self.assertEqual(len(result[0].past_user_inputs), 1) - self.assertEqual(len(result[1].past_user_inputs), 1) - self.assertEqual(len(result[0].generated_responses), 1) - self.assertEqual(len(result[1].generated_responses), 1) - self.assertEqual(result[0].past_user_inputs[0], "Going to the movies tonight - any suggestions?") - self.assertEqual(result[0].generated_responses[0], "The Big Lebowski") - self.assertEqual(result[1].past_user_inputs[0], "What's the last book you have read?") - self.assertEqual(result[1].generated_responses[0], "The Last Question") - # When - conversation_2.add_user_input("Why do you recommend it?") - result = conversation_agent(conversation_2, do_sample=False, max_length=1000) - # Then - self.assertEqual(result, conversation_2) - self.assertEqual(len(result.past_user_inputs), 2) - self.assertEqual(len(result.generated_responses), 2) - self.assertEqual(result.past_user_inputs[1], "Why do you recommend it?") - self.assertEqual(result.generated_responses[1], "It's a good book.") - - @require_torch - @slow - def test_integration_torch_conversation_truncated_history(self): - # When - conversation_agent = pipeline(task="conversational", min_length_for_response=24, device=torch_device) - conversation_1 = Conversation("Going to the movies tonight - any suggestions?") - # Then - self.assertEqual(len(conversation_1.past_user_inputs), 0) - # When - result = conversation_agent(conversation_1, do_sample=False, max_length=36) - # Then - self.assertEqual(result, conversation_1) - self.assertEqual(len(result.past_user_inputs), 1) - self.assertEqual(len(result.generated_responses), 1) - self.assertEqual(result.past_user_inputs[0], "Going to the movies tonight - any suggestions?") - self.assertEqual(result.generated_responses[0], "The Big Lebowski") - # When - conversation_1.add_user_input("Is it an action movie?") - result = conversation_agent(conversation_1, do_sample=False, max_length=36) - # Then - self.assertEqual(result, conversation_1) - self.assertEqual(len(result.past_user_inputs), 2) - self.assertEqual(len(result.generated_responses), 2) - self.assertEqual(result.past_user_inputs[1], "Is it an action movie?") - self.assertEqual(result.generated_responses[1], "It's a comedy.") - - @require_torch - def test_small_model_pt(self): - tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") - model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - conversation = Conversation("hello") - output = conversation_agent(conversation) - self.assertEqual(output, Conversation(past_user_inputs=["hello"], generated_responses=["Hi"])) - - @require_tf - def test_small_model_tf(self): - tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") - model = TFAutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - conversation = Conversation("hello") - output = conversation_agent(conversation) - self.assertEqual(output, Conversation(past_user_inputs=["hello"], 
generated_responses=["Hi"])) - - @require_torch - @slow - def test_integration_torch_conversation_dialogpt_input_ids(self): - tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") - model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - - conversation_1 = Conversation("hello") - inputs = conversation_agent.preprocess(conversation_1) - self.assertEqual(inputs["input_ids"].tolist(), [[31373, 50256]]) - - conversation_2 = Conversation("how are you ?", past_user_inputs=["hello"], generated_responses=["Hi there!"]) - inputs = conversation_agent.preprocess(conversation_2) - self.assertEqual( - inputs["input_ids"].tolist(), [[31373, 50256, 17250, 612, 0, 50256, 4919, 389, 345, 5633, 50256]] - ) - - @unittest.skip(reason="Model is curently gated") - @require_torch - @slow - def test_integration_torch_conversation_llama2_input_ids(self): - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_default_system_prompt=True) - - conversation = Conversation( - "What is so great about #1?", - past_user_inputs=["I am going to Paris, what should I see?"], - generated_responses=[ - """\ -Paris, the capital of France, is known for its stunning architecture, art museums, historical landmarks, and romantic atmosphere. Here are some of the top attractions to see in Paris: - -1. The Eiffel Tower: The iconic Eiffel Tower is one of the most recognizable landmarks in the world and offers breathtaking views of the city. -2. The Louvre Museum: The Louvre is one of the world's largest and most famous museums, housing an impressive collection of art and artifacts, including the Mona Lisa. -3. Notre-Dame Cathedral: This beautiful cathedral is one of the most famous landmarks in Paris and is known for its Gothic architecture and stunning stained glass windows. - -These are just a few of the many attractions that Paris has to offer. 
With so much to see and do, it's no wonder that Paris is one of the most popular tourist destinations in the world.""" - ], - ) - inputs = tokenizer._build_conversation_input_ids(conversation) - EXPECTED_INPUTS_IDS = [ 1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889, 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641, 9109, 29889, 29871, 3575, 6089, 881, 451, 3160, 738, 10311, 1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916, 391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793, 29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443, 5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644, 263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338, 451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012, 310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915, 29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016, 29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 29902, 626, 2675, 304, 3681, 29892, 825, 881, 306, 1074, 29973, 518, 29914, 25580, 29962, 3681, 29892, 278, 7483, 310, 3444, 29892, 338, 2998, 363, 967, 380, 27389, 11258, 29892, 1616, 19133, 29879, 29892, 15839, 2982, 22848, 29892, 322, 6017, 7716, 25005, 29889, 2266, 526, 777, 310, 278, 2246, 19650, 1953, 304, 1074, 297, 3681, 29901, 13, 13, 29896, 29889, 450, 382, 2593, 295, 23615, 29901, 450, 9849, 293, 382, 2593, 295, 23615, 338, 697, 310, 278, 1556, 5936, 13902, 2982, 22848, 297, 278, 3186, 322, 16688, 2078, 271, 400, 5086, 8386, 310, 278, 4272, 29889, 13, 29906, 29889, 450, 4562, 12675, 6838, 29901, 450, 4562, 12675, 338, 697, 310, 278, 3186, 29915, 29879, 10150, 322, 1556, 13834, 19133, 29879, 29892, 27261, 385, 21210, 573, 4333, 310, 1616, 322, 24238, 29879, 29892, 3704, 278, 2598, 29874, 29420, 29889, 13, 29941, 29889, 24337, 29899, 29928, 420, 315, 21471, 29901, 910, 9560, 274, 21471, 338, 697, 310, 278, 1556, 13834, 2982, 22848, 297, 3681, 322, 338, 2998, 363, 967, 22883, 293, 11258, 322, 380, 27389, 380, 7114, 12917, 5417, 29889, 13, 13, 1349, 968, 526, 925, 263, 2846, 310, 278, 1784, 19650, 1953, 393, 3681, 756, 304, 5957, 29889, 2973, 577, 1568, 304, 1074, 322, 437, 29892, 372, 29915, 29879, 694, 4997, 393, 3681, 338, 697, 310, 278, 1556, 5972, 6282, 391, 15422, 800, 297, 278, 3186, 29889, 29871, 2, 1, 518, 25580, 29962, 1724, 338, 577, 2107, 1048, 396, 29896, 29973, 518, 29914, 25580, 29962] # fmt: skip - self.assertEqual(inputs, EXPECTED_INPUTS_IDS) - - model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - EXPECTED_TEXT = "what topic you want to focus on and create content around it. This will help you stand out from other creators and attract a specific audience.\n\nStep 2: Set Up Your Channel\nCreate your YouTube account and customize your channel with your branding and logo. Make sure your channel name and profile picture are consistent with your niche.\n\nStep 3: Plan Your Content\nDevelop a content strategy that includes the type of content you want to create, how often you will post, and when you will post. Consider creating a content calendar to help you stay organized.\n\nStep 4: Invest in Quality Equipment\nInvest in good quality camera and microphone equipment to ensure your videos look and sound professional. 
You don't need to break the bank, but investing in good equipment will make a big difference in the quality of your videos.\n\nStep 5: Optimize Your Videos for Search\nUse keywords in your video titles, descriptions, and tags to help people find your videos when they search for topics related to your niche" - conversation = Conversation( - "<>\n Only answer with emojis, and charades\n<>\n\nHow can I build a house in 10 steps?" - ) - result = conversation_agent(conversation) - self.assertEqual(result.generated_responses[-1], EXPECTED_TEXT) - - @require_torch - @slow - def test_integration_torch_conversation_blenderbot_400M_input_ids(self): - tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") - model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - - # test1 - conversation_1 = Conversation("hello") - inputs = conversation_agent.preprocess(conversation_1) - self.assertEqual(inputs["input_ids"].tolist(), [[1710, 86, 2]]) - - # test2 - conversation_1 = Conversation( - "I like lasagne.", - past_user_inputs=["hello"], - generated_responses=[ - " Do you like lasagne? It is a traditional Italian dish consisting of a shepherd's pie." - ], - ) - inputs = conversation_agent.preprocess(conversation_1) - self.assertEqual( - inputs["input_ids"].tolist(), - [ - # This should be compared with the same conversation on ParlAI `safe_interactive` demo. - [ - 1710, # hello - 86, - 228, # Double space - 228, - 946, - 304, - 398, - 6881, - 558, - 964, - 38, - 452, - 315, - 265, - 6252, - 452, - 322, - 968, - 6884, - 3146, - 278, - 306, - 265, - 617, - 87, - 388, - 75, - 341, - 286, - 521, - 21, - 228, # Double space - 228, - 281, # I like lasagne. - 398, - 6881, - 558, - 964, - 21, - 2, # EOS - ], - ], - ) - - @require_torch - @slow - def test_integration_torch_conversation_blenderbot_400M(self): - tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") - model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) - - conversation_1 = Conversation("hello") - result = conversation_agent( - conversation_1, - ) - self.assertEqual( - result.generated_responses[0], - # ParlAI implementation output, we have a different one, but it's our - # second best, you can check by using num_return_sequences=10 - # " Hello! How are you? I'm just getting ready to go to work, how about you?", - " Hello! How are you doing today? I just got back from a walk with my dog.", - ) - - conversation_1 = Conversation("Lasagne hello") - result = conversation_agent(conversation_1, encoder_no_repeat_ngram_size=3) - self.assertEqual( - result.generated_responses[0], - " Do you like lasagne? It is a traditional Italian dish consisting of a shepherd's pie.", - ) - - conversation_1 = Conversation( - "Lasagne hello Lasagne is my favorite Italian dish. Do you like lasagne? I like lasagne." - ) - result = conversation_agent( - conversation_1, - encoder_no_repeat_ngram_size=3, - ) - self.assertEqual( - result.generated_responses[0], - " Me too. 
I like how it can be topped with vegetables, meats, and condiments.", - ) - - @require_torch - @slow - def test_integration_torch_conversation_encoder_decoder(self): - # When - tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M") - model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot_small-90M") - conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer, device=torch_device) - - conversation_1 = Conversation("My name is Sarah and I live in London") - conversation_2 = Conversation("Going to the movies tonight, What movie would you recommend? ") - # Then - self.assertEqual(len(conversation_1.past_user_inputs), 0) - self.assertEqual(len(conversation_2.past_user_inputs), 0) - # When - result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000) - # Then - self.assertEqual(result, [conversation_1, conversation_2]) - self.assertEqual(len(result[0].past_user_inputs), 1) - self.assertEqual(len(result[1].past_user_inputs), 1) - self.assertEqual(len(result[0].generated_responses), 1) - self.assertEqual(len(result[1].generated_responses), 1) - self.assertEqual(result[0].past_user_inputs[0], "My name is Sarah and I live in London") - self.assertEqual( - result[0].generated_responses[0], - "hi sarah, i live in london as well. do you have any plans for the weekend?", - ) - self.assertEqual( - result[1].past_user_inputs[0], "Going to the movies tonight, What movie would you recommend? " - ) - self.assertEqual( - result[1].generated_responses[0], "i don't know... i'm not really sure. what movie are you going to see?" - ) - # When - conversation_1.add_user_input("Not yet, what about you?") - conversation_2.add_user_input("What's your name?") - result = conversation_agent([conversation_1, conversation_2], do_sample=False, max_length=1000) - # Then - self.assertEqual(result, [conversation_1, conversation_2]) - self.assertEqual(len(result[0].past_user_inputs), 2) - self.assertEqual(len(result[1].past_user_inputs), 2) - self.assertEqual(len(result[0].generated_responses), 2) - self.assertEqual(len(result[1].generated_responses), 2) - self.assertEqual(result[0].past_user_inputs[1], "Not yet, what about you?") - self.assertEqual(result[0].generated_responses[1], "i don't have any plans yet. i'm not sure what to do yet.") - self.assertEqual(result[1].past_user_inputs[1], "What's your name?") - self.assertEqual(result[1].generated_responses[1], "i don't have a name, but i'm going to see a horror movie.") - - @require_torch - @slow - def test_from_pipeline_conversation(self): - model_id = "facebook/blenderbot_small-90M" - - # from model id - conversation_agent_from_model_id = pipeline("conversational", model=model_id, tokenizer=model_id) - - # from model object - model = BlenderbotSmallForConditionalGeneration.from_pretrained(model_id) - tokenizer = BlenderbotSmallTokenizer.from_pretrained(model_id) - conversation_agent_from_model = pipeline("conversational", model=model, tokenizer=tokenizer) - - conversation = Conversation("My name is Sarah and I live in London") - conversation_copy = Conversation("My name is Sarah and I live in London") - - result_model_id = conversation_agent_from_model_id([conversation]) - result_model = conversation_agent_from_model([conversation_copy]) - - # check for equality - self.assertEqual( - result_model_id.generated_responses[0], - "hi sarah, i live in london as well. 
do you have any plans for the weekend?", - ) - self.assertEqual( - result_model_id.generated_responses[0], - result_model.generated_responses[0], - ) From e0182f3bd7f4753c1e378e052ceea67898d97359 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Wed, 24 Jul 2024 15:00:48 +0100 Subject: [PATCH 37/73] RoPE: relaxed rope validation (#32182) * relaxed rope check * lets also accept rope_type=None, defaulting to the original implementation * type and rope_type can coexist --- src/transformers/modeling_rope_utils.py | 72 +++++++++---------- .../models/llama/configuration_llama.py | 3 + tests/models/llama/test_modeling_llama.py | 54 ++++++++++++++ 3 files changed, 93 insertions(+), 36 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 14a12b9394059d..bee0269ff82e9a 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -354,6 +354,11 @@ def _compute_llama3_parameters( def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None): """Compare the received keys in `config.rope_scaling` against the expected and optional keys""" + # BC: "rope_type" was originally "type" -- let's gracefully handle it + if "rope_type" not in received_keys and "type" in received_keys: + received_keys -= {"type"} + received_keys.add("rope_type") + missing_keys = required_keys - received_keys if missing_keys: raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}") @@ -361,14 +366,14 @@ def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, if optional_keys is not None: unused_keys = received_keys - required_keys - optional_keys else: - unused_keys = received_keys - received_keys + unused_keys = received_keys - required_keys if unused_keys: - raise KeyError(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}") + logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}") def _validate_default_rope_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type"} received_keys = set(rope_scaling.keys()) _check_received_keys(rope_type, received_keys, required_keys) @@ -376,19 +381,19 @@ def _validate_default_rope_parameters(config: PretrainedConfig): def _validate_linear_scaling_rope_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor"} received_keys = set(rope_scaling.keys()) _check_received_keys(rope_type, received_keys, required_keys) factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor"} # TODO (joao): 
update logic for the inclusion of `original_max_position_embeddings` optional_keys = {"original_max_position_embeddings"} @@ -397,12 +402,12 @@ def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig): factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") def _validate_yarn_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor"} optional_keys = {"attention_factor", "beta_fast", "beta_slow"} received_keys = set(rope_scaling.keys()) @@ -410,22 +415,22 @@ def _validate_yarn_parameters(config: PretrainedConfig): factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") attention_factor = rope_scaling.get("attention_factor") if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): - raise ValueError( + logger.warning( f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" ) beta_fast = rope_scaling.get("beta_fast") if beta_fast is not None and not isinstance(beta_fast, float): - raise ValueError(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") + logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") beta_slow = rope_scaling.get("beta_slow") if beta_slow is not None and not isinstance(beta_slow, float): - raise ValueError(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") + logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") if (beta_fast or 32) < (beta_slow or 1): - raise ValueError( + logger.warning( f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" ) @@ -433,7 +438,7 @@ def _validate_yarn_parameters(config: PretrainedConfig): def _validate_longrope_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "short_factor", "long_factor"} # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} @@ -445,15 +450,15 @@ def _validate_longrope_parameters(config: PretrainedConfig): short_factor = rope_scaling.get("short_factor") if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor): - raise ValueError(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}") + logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}") if not len(short_factor) == dim // 2: - raise ValueError(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}") + logger.warning(f"`rope_scaling`'s short_factor field must 
have length {dim // 2}, got {len(short_factor)}") long_factor = rope_scaling.get("long_factor") if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor): - raise ValueError(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}") + logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}") if not len(long_factor) == dim // 2: - raise ValueError(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") + logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is @@ -468,48 +473,48 @@ def _validate_longrope_parameters(config: PretrainedConfig): else: factor = rope_scaling.get("factor") if factor is None: - raise ValueError("Missing required keys in `rope_scaling`: 'factor'") + logger.warning("Missing required keys in `rope_scaling`: 'factor'") elif not isinstance(factor, float) or factor < 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") attention_factor = rope_scaling.get("attention_factor") if attention_factor is not None and not isinstance(attention_factor, float) or attention_factor < 0: - raise ValueError( + logger.warning( f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" ) def _validate_llama3_parameters(config: PretrainedConfig): rope_scaling = config.rope_scaling - rope_type = rope_scaling["rope_type"] + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"} received_keys = set(rope_scaling.keys()) _check_received_keys(rope_type, received_keys, required_keys) factor = rope_scaling["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") low_freq_factor = rope_scaling["low_freq_factor"] high_freq_factor = rope_scaling["high_freq_factor"] if low_freq_factor is None or not isinstance(low_freq_factor, float): - raise ValueError(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}") + logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}") if high_freq_factor is None or not isinstance(high_freq_factor, float): - raise ValueError(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}") + logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}") if high_freq_factor < low_freq_factor: - raise ValueError( + logger.warning( "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" f"{high_freq_factor} and low_freq_factor={low_freq_factor}" ) original_max_position_embeddings = rope_scaling["original_max_position_embeddings"] if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int): - raise ValueError( + logger.warning( "`rope_scaling`'s 
original_max_position_embeddings field must be an integer, got " f"{original_max_position_embeddings}" ) if original_max_position_embeddings >= config.max_position_embeddings: - raise ValueError( + logger.warning( "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got " f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}" ) @@ -534,17 +539,12 @@ def rope_config_validation(config: PretrainedConfig): if rope_scaling is None: return - possible_rope_types = set(ROPE_INIT_FUNCTIONS.keys()) - rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" - if rope_type is None: - raise ValueError( - f"rope_scaling must contain a non-None 'rope_type' field. Possible options are {possible_rope_types}" - ) - + # BC: "rope_type" was originally "type" + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default")) validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) if validation_fn is not None: validation_fn(config) else: - raise ValueError( + logger.warning( f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" ) diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index c632a870be7a18..710809093f3849 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -189,6 +189,9 @@ def __init__( self.mlp_bias = mlp_bias # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, move it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] rope_config_validation(self) super().__init__( diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 85d352fc814f6f..19cb7bd6be393c 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -526,6 +526,60 @@ def test_rope_class_retrocompatibility(self): torch.testing.assert_close(old_cos_long, new_cos_long) torch.testing.assert_close(old_sin_long, new_sin_long) + def test_model_loading_old_rope_configs(self): + def _reinitialize_config(base_config, new_kwargs): + # Reinitialize the config with the new kwargs, forcing the config to go through its __init__ validation + # steps. 
+ base_config_dict = base_config.to_dict() + new_config = LlamaConfig.from_dict(config_dict={**base_config_dict, **new_kwargs}) + return new_config + + # from untouched config -> ✅ + base_config, model_inputs = self.model_tester.prepare_config_and_inputs_for_common() + original_model = LlamaForCausalLM(base_config).to(torch_device) + original_model(**model_inputs) + + # from a config with the expected rope configuration -> ✅ + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0}}) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with the old rope configuration ('type' instead of 'rope_type') -> ✅ we gracefully handle BC + config = _reinitialize_config(base_config, {"rope_scaling": {"type": "linear", "factor": 10.0}}) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with both 'type' and 'rope_type' -> ✅ they can coexist (and both are present in the config) + config = _reinitialize_config( + base_config, {"rope_scaling": {"type": "linear", "rope_type": "linear", "factor": 10.0}} + ) + self.assertTrue(config.rope_scaling["type"] == "linear") + self.assertTrue(config.rope_scaling["rope_type"] == "linear") + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + + # from a config with parameters in a bad range ('factor' should be >= 1.0) -> ⚠️ throws a warning + with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": -999.0}}) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + self.assertEqual(len(logs.output), 1) + self.assertIn("factor field", logs.output[0]) + + # from a config with unknown parameters ('foo' isn't a rope option) -> ⚠️ throws a warning + with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: + config = _reinitialize_config( + base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0, "foo": "bar"}} + ) + original_model = LlamaForCausalLM(config).to(torch_device) + original_model(**model_inputs) + self.assertEqual(len(logs.output), 1) + self.assertIn("Unrecognized keys", logs.output[0]) + + # from a config with specific rope type but missing one of its mandatory parameters -> ❌ throws exception + with self.assertRaises(KeyError): + config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" + @require_flash_attn @require_torch_gpu @require_bitsandbytes From 8d2534c4d0ab94a97a72d2ce6bb9ccd201abadb3 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:06:39 +0200 Subject: [PATCH 38/73] let's not warn when someone is running a forward (#32176) * let's not warn when someone is running a foward without cache + self.training * more models * fixup --- src/transformers/models/cohere/modeling_cohere.py | 4 +++- src/transformers/models/dbrx/modeling_dbrx.py | 4 +++- src/transformers/models/gemma/diff_gemma.py | 4 +++- src/transformers/models/gemma/modeling_gemma.py | 8 ++++++-- src/transformers/models/jetmoe/modeling_jetmoe.py | 4 +++- src/transformers/models/llama/modeling_llama.py | 4 +++- src/transformers/models/mistral/modeling_mistral.py | 2 +- src/transformers/models/mixtral/modeling_mixtral.py | 2 +- src/transformers/models/olmo/modeling_olmo.py | 4 +++- 
src/transformers/models/persimmon/modeling_persimmon.py | 2 +- src/transformers/models/phi/modeling_phi.py | 2 +- src/transformers/models/phi3/modeling_phi3.py | 2 +- src/transformers/models/qwen2/modeling_qwen2.py | 2 +- src/transformers/models/qwen2_moe/modeling_qwen2_moe.py | 2 +- src/transformers/models/stablelm/modeling_stablelm.py | 2 +- src/transformers/models/starcoder2/modeling_starcoder2.py | 2 +- 16 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 6532c656d453e0..6257eeb9958cc8 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -769,7 +769,9 @@ def forward( past_seen_tokens = 0 return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index e3be8decbc6b52..b1f3ce1b8ba963 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -1005,7 +1005,9 @@ def forward( inputs_embeds = nn.functional.dropout(inputs_embeds, p=self.emb_pdrop, training=self.training) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/gemma/diff_gemma.py b/src/transformers/models/gemma/diff_gemma.py index d2a653120965da..4e2ea82950bf18 100644 --- a/src/transformers/models/gemma/diff_gemma.py +++ b/src/transformers/models/gemma/diff_gemma.py @@ -474,7 +474,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False # noqa: F841 - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True # noqa: F841 past_key_values = DynamicCache.from_legacy_cache(past_key_values) diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 5bc1af3e7ec7a9..ae27fe98512fa5 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -770,7 +770,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False # noqa: F841 - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True # noqa: F841 past_key_values = DynamicCache.from_legacy_cache(past_key_values) @@ -795,7 +797,9 @@ def forward( # See https://github.com/huggingface/transformers/pull/29402 normalizer = 
torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) hidden_states = hidden_states * normalizer - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index fa15393a40a5f3..583751520183a0 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -978,7 +978,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index adb455acfbbc24..d553ce4432f9cc 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -894,7 +894,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 93a60a49dbf34c..7c339271a58974 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -758,7 +758,7 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: past_key_values = DynamicCache.from_legacy_cache(past_key_values) return_legacy_cache = True logger.warning_once( diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index d2ee6e6b268ae0..7df175a0467dcf 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -960,7 +960,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index 74d49d5606c145..16e8711188ddaf 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -811,7 +811,9 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) return_legacy_cache = False - if 
use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs) + if ( + use_cache and not isinstance(past_key_values, Cache) and not self.training + ): # kept for BC (non `Cache` `past_key_values` inputs) return_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index af22145e3e9de9..c718b7a406333a 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -626,7 +626,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 1b23be39e5c05d..1289910381e52e 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -909,7 +909,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 90b815184b07a8..dfcb7c2dd009ec 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -950,7 +950,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 1ff8896ae5f901..67c8c71fe59db4 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -808,7 +808,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 54e91da3347dbc..9b50235c15d52b 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -970,7 +970,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 3a3b6a9e05f117..086f525644b2da 100755 --- 
a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -902,7 +902,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index f2786f9df48a6b..18dbe510450f74 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -784,7 +784,7 @@ def forward( use_cache = False use_legacy_cache = False - if use_cache and not isinstance(past_key_values, Cache): + if use_cache and not isinstance(past_key_values, Cache) and not self.training: use_legacy_cache = True past_key_values = DynamicCache.from_legacy_cache(past_key_values) logger.warning_once( From 1392a6867f40a55dfabaf306745c67627598b1af Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 24 Jul 2024 19:26:20 +0500 Subject: [PATCH 39/73] Fix resize embedding with Deepspeed (#32192) fix resize when deepspeed --- src/transformers/modeling_utils.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 81403f524f9e79..9f32da16fd7972 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2131,13 +2131,23 @@ def _get_resized_embeddings( # Replace weights in old_embeddings and return to maintain the same embedding type. # This ensures correct functionality when a Custom Embedding class is passed as input. # The input and output embedding types remain consistent. (c.f. https://github.com/huggingface/transformers/pull/31979) - old_embeddings.weight.data = new_embeddings.weight.data - old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] + if is_deepspeed_zero3_enabled() and not is_quantized: + import deepspeed + + params = [old_embeddings.weight, new_embeddings.weight] + with deepspeed.zero.GatheredParameters(params, modifier_rank=0): + old_embeddings.weight.data = new_embeddings.weight.data + old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] - # If the new number of tokens is smaller than the original `padding_idx`, the `padding_idx` - # will be set to `None` in the resized embeddings. - if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: - old_embeddings.padding_idx = None + # If the new number of tokens is smaller than the original `padding_idx`, the `padding_idx` + # will be set to `None` in the resized embeddings. 
+ if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: + old_embeddings.padding_idx = None + else: + old_embeddings.weight.data = new_embeddings.weight.data + old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] + if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: + old_embeddings.padding_idx = None return old_embeddings From af0e4b7b37b2d7eefe7531cf5201a5d6bae85525 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Wed, 24 Jul 2024 17:14:05 +0200 Subject: [PATCH 40/73] Fix float8_e4m3fn in modeling_utils (#32193) * Fix float8_e4m3fn in modeling_utils * style * fix * comment --- src/transformers/modeling_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 9f32da16fd7972..8f1ad56f6999df 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -855,6 +855,8 @@ def _load_state_dict_into_meta_model( for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) + is_torch_e4m3fn_available = hasattr(torch, "float8_e4m3fn") + for param_name, param in state_dict.items(): # First part of the test is always true as load_state_dict_keys always contains state_dict keys. if param_name not in loaded_state_dict_keys or param_name not in expected_keys: @@ -866,9 +868,10 @@ def _load_state_dict_into_meta_model( module_name = param_name set_module_kwargs = {} - # We convert floating dtypes to the `dtype` passed. We want to keep the buffers/params + # We convert floating dtypes to the `dtype` passed except for float8_e4m3fn type. We also want to keep the buffers/params # in int/uint/bool and not cast them. 
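On the DeepSpeed embedding-resize fix above (#32192): under ZeRO-3 the embedding weights are partitioned across ranks, so a plain `old_embeddings.weight.data = new_embeddings.weight.data` assignment would only touch a local shard. The patch therefore performs the copy inside a `deepspeed.zero.GatheredParameters` context, which materializes both tensors first. A hedged, stand-alone sketch of that pattern follows; it assumes DeepSpeed is installed and a ZeRO-3 run is active, and the helper name is illustrative.

```python
import deepspeed
import torch.nn as nn


def copy_resized_embedding(old_embeddings: nn.Embedding, new_embeddings: nn.Embedding) -> nn.Embedding:
    # Gather the possibly ZeRO-3-partitioned weights; modifier_rank=0 means the
    # modification is made on rank 0 and broadcast when the context exits.
    params = [old_embeddings.weight, new_embeddings.weight]
    with deepspeed.zero.GatheredParameters(params, modifier_rank=0):
        old_embeddings.weight.data = new_embeddings.weight.data
        old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0]
    return old_embeddings
```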
- if dtype is not None and torch.is_floating_point(param) and param.dtype != torch.float8_e4m3fn: + is_param_float8_e4m3fn = is_torch_e4m3fn_available and param.dtype == torch.float8_e4m3fn + if dtype is not None and torch.is_floating_point(param) and not is_param_float8_e4m3fn: if ( keep_in_fp32_modules is not None and any( From 1c122a46dc3c4448901f8d2f3018d9d58b846ba5 Mon Sep 17 00:00:00 2001 From: Penut Chen <94501378+PenutChen@users.noreply.github.com> Date: Wed, 24 Jul 2024 23:59:59 +0800 Subject: [PATCH 41/73] Support dequantizing GGUF FP16 format (#31783) * support gguf fp16 * support gguf bf16 with pytorch * add gguf f16 test * remove bf16 --- src/transformers/integrations/ggml.py | 3 +++ tests/quantization/ggml/test_ggml.py | 14 ++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py index 71aa87afa94b5d..47f3f0cf8d57b4 100644 --- a/src/transformers/integrations/ggml.py +++ b/src/transformers/integrations/ggml.py @@ -36,6 +36,7 @@ # Listed here: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md GGML_TYPES = { "F32": 0, + "F16": 1, "Q4_0": 2, "Q8_0": 8, "Q2_K": 10, @@ -489,6 +490,8 @@ def dequantize_q5_k(data): def load_dequant_gguf_tensor(shape, ggml_type, data): if ggml_type == GGML_TYPES["F32"]: values = data + elif ggml_type == GGML_TYPES["F16"]: + values = data elif ggml_type == GGML_TYPES["Q8_0"]: values = dequantize_q8_0(data) elif ggml_type == GGML_TYPES["Q4_0"]: diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index a5866094a1cc6f..e42900a1d51b44 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -33,6 +33,7 @@ class GgufIntegrationTests(unittest.TestCase): mistral_model_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF" qwen2_model_id = "Qwen/Qwen1.5-0.5B-Chat-GGUF" llama3_model_id = "NousResearch/Meta-Llama-3-8B-GGUF" + tinyllama_model_id = "PenutChen/TinyLlama-1.1B-Chat-v1.0-GGUF" q4_0_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_0.gguf" q4_k_gguf_model_id = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" @@ -45,6 +46,7 @@ class GgufIntegrationTests(unittest.TestCase): q4_0_mistral_model_id = "mistral-7b-instruct-v0.2.Q4_0.gguf" q4_0_qwen2_model_id = "qwen1_5-0_5b-chat-q4_0.gguf" q4_llama3_model_id = "Meta-Llama-3-8B-Q4_K_M.gguf" + f16_tinyllama_model_id = "TinyLlama-1.1B-Chat-v1.0.FP16.gguf" example_text = "Hello" @@ -149,6 +151,18 @@ def test_q8_0(self): EXPECTED_TEXT = "Hello, World!\n\n5. Use a library" self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_f16(self): + tokenizer = AutoTokenizer.from_pretrained(self.tinyllama_model_id, gguf_file=self.f16_tinyllama_model_id) + model = AutoModelForCausalLM.from_pretrained( + self.tinyllama_model_id, gguf_file=self.f16_tinyllama_model_id + ).to(torch_device) + + text = tokenizer(self.example_text, return_tensors="pt").to(torch_device) + out = model.generate(**text, max_new_tokens=10) + + EXPECTED_TEXT = "Hello, World!\n\n5. 
Node.js" + self.assertEqual(tokenizer.decode(out[0], skip_special_tokens=True), EXPECTED_TEXT) + def test_mistral_q4_0(self): tokenizer = AutoTokenizer.from_pretrained(self.mistral_model_id, gguf_file=self.q4_0_mistral_model_id) model = AutoModelForCausalLM.from_pretrained( From edd68f4ed8db241bd3e9dc6c4ed96d471f243c9a Mon Sep 17 00:00:00 2001 From: Matt Date: Wed, 24 Jul 2024 17:36:32 +0100 Subject: [PATCH 42/73] :rotating_light: No more default chat templates (#31733) * No more default chat templates * Add the template to the GPT-SW3 tests since it's not available by default now * Fix GPT2 test * Fix Bloom test * Fix Bloom test * Remove default templates again --- docs/source/en/chat_templating.md | 19 +- docs/source/es/chat_templating.md | 8 +- docs/source/ja/chat_templating.md | 2 +- docs/source/zh/chat_templating.md | 2 +- .../blenderbot/tokenization_blenderbot.py | 14 -- .../tokenization_blenderbot_fast.py | 15 -- .../tokenization_blenderbot_small.py | 15 -- .../tokenization_blenderbot_small_fast.py | 15 -- .../models/bloom/tokenization_bloom_fast.py | 8 - .../code_llama/tokenization_code_llama.py | 55 ------ .../tokenization_code_llama_fast.py | 55 ------ .../models/cohere/tokenization_cohere_fast.py | 182 ------------------ .../tokenization_gptsan_japanese.py | 13 -- .../models/gpt2/tokenization_gpt2.py | 7 - .../models/gpt2/tokenization_gpt2_fast.py | 9 - .../gpt_neox/tokenization_gpt_neox_fast.py | 8 - .../tokenization_gpt_neox_japanese.py | 12 -- .../models/gpt_sw3/tokenization_gpt_sw3.py | 16 -- .../models/idefics2/processing_idefics2.py | 57 ------ .../models/llama/tokenization_llama.py | 54 ------ .../models/llama/tokenization_llama_fast.py | 55 ------ .../processing_llava_next_video.py | 60 ------ .../models/whisper/tokenization_whisper.py | 8 - .../whisper/tokenization_whisper_fast.py | 8 - src/transformers/processing_utils.py | 13 +- src/transformers/tokenization_utils_base.py | 54 ++---- tests/models/bloom/test_tokenization_bloom.py | 1 + tests/models/gpt2/test_tokenization_gpt2.py | 1 + .../gpt_sw3/test_tokenization_gpt_sw3.py | 9 + 29 files changed, 28 insertions(+), 747 deletions(-) diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md index d840caaf660520..c4069dd1afc706 100644 --- a/docs/source/en/chat_templating.md +++ b/docs/source/en/chat_templating.md @@ -580,7 +580,7 @@ default template for that model class is used instead. Let's take a look at the >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` @@ -704,23 +704,6 @@ with other names, pass the name of the template you want to the `chat_template` We find that this can be a bit confusing for users, though - so if you're writing a template yourself, we recommend trying to put it all in a single template where possible! -### What are "default" templates? - -Before the introduction of chat templates, chat handling was hardcoded at the model class level. For backwards -compatibility, we have retained this class-specific handling as default templates, also set at the class level. 
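On the GGUF FP16 support above (#31783): with `F16` registered in `GGML_TYPES`, an FP16 GGUF checkpoint now loads through the same `gguf_file` path as the quantized variants. A usage sketch mirroring the new test; the repository and file names are the ones the test uses and are assumptions about what stays published, not guarantees.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "PenutChen/TinyLlama-1.1B-Chat-v1.0-GGUF"  # repo used in the new test
gguf_file = "TinyLlama-1.1B-Chat-v1.0.FP16.gguf"      # FP16 GGUF checkpoint

# Both the tokenizer and the model accept `gguf_file`; the FP16 tensors are
# converted to regular torch tensors during the GGUF -> state_dict conversion.
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=gguf_file)
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=gguf_file)

inputs = tokenizer("Hello", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```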
If a -model does not have a chat template set, but there is a default template for its model class, the `TextGenerationPipeline` -class and methods like `apply_chat_template` will use the class template instead. You can find out what the default -template for your tokenizer is by checking the `tokenizer.default_chat_template` attribute. - -This is something we do purely for backward compatibility reasons, to avoid breaking any existing workflows. Even when -the class template is appropriate for your model, we strongly recommend overriding the default template by -setting the `chat_template` attribute explicitly to make it clear to users that your model has been correctly configured -for chat. - -Now that actual chat templates have been adopted more widely, default templates have been deprecated and will be -removed in a future release. We strongly recommend setting the `chat_template` attribute for any tokenizers that -still depend on them! - ### What template should I use? When setting the template for a model that's already been trained for chat, you should ensure that the template diff --git a/docs/source/es/chat_templating.md b/docs/source/es/chat_templating.md index 10129e87ef1184..e287c213743542 100644 --- a/docs/source/es/chat_templating.md +++ b/docs/source/es/chat_templating.md @@ -220,7 +220,7 @@ La plantilla de chat para un modelo se almacena en el atributo `tokenizer.chat_t >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` @@ -307,12 +307,6 @@ Si estás ajustando finamente un modelo para chat, además de establecer una pla -### ¿Qué son las plantillas "default"? - -Antes de la introducción de las plantillas de chat, el manejo del chat estaba codificado en el nivel de la clase del modelo. Por razones de compatibilidad con versiones anteriores, hemos conservado este manejo específico de la clase como plantillas predeterminadas, también establecidas a nivel de clase. Si un modelo no tiene una plantilla de chat establecida, pero hay una plantilla predeterminada para su clase de modelo, la clase `TextGenerationPipeline` y métodos como `apply_chat_template` usarán la plantilla de clase en su lugar. Puedes averiguar cuál es la plantilla predeterminada para tu tokenizador comprobando el atributo `tokenizer.default_chat_template`. - -Esto es algo que hacemos puramente por razones de compatibilidad con versiones anteriores, para evitar romper cualquier flujo de trabajo existente. Incluso cuando la plantilla de clase es apropiada para tu modelo, recomendamos encarecidamente anular la plantilla predeterminada estableciendo explícitamente el atributo `chat_template` para dejar claro a los usuarios que tu modelo ha sido configurado correctamente para el chat, y para estar preparados para el futuro en caso de que las plantillas predeterminadas alguna vez se alteren o se eliminen. - ### ¿Qué plantilla debería usar? Cuando establezcas la plantilla para un modelo que ya ha sido entrenado para chat, debes asegurarte de que la plantilla coincida exactamente con el formato de mensajes que el modelo vio durante el entrenamiento, o de lo contrario es probable que experimentes degradación del rendimiento. 
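With the default templates removed in #31733, a tokenizer whose model class previously supplied an implicit template must now carry an explicit `chat_template`; `apply_chat_template` no longer falls back to a class-level default. A minimal sketch of pinning and persisting the whitespace-joining Blenderbot template quoted in the docs above (the template string is the one shown there; the save path is illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

# Pin the template that used to be supplied implicitly by the model class.
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
    "{{ message['content'] }}"
    "{% if not loop.last %}{{ ' ' }}{% endif %}"
    "{% endfor %}"
    "{{ eos_token }}"
)

chat = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great!"},
]
print(tokenizer.apply_chat_template(chat, tokenize=False))

# Saving the tokenizer persists the template in tokenizer_config.json.
tokenizer.save_pretrained("./blenderbot-with-template")  # illustrative path
```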
Esto es cierto incluso si estás entrenando aún más el modelo; probablemente obtendrás el mejor rendimiento si mantienes constantes los tokens de chat. Esto es muy análogo a la tokenización: generalmente obtienes el mejor rendimiento para la inferencia o el ajuste fino cuando coincides precisamente con la tokenización utilizada durante el entrenamiento. diff --git a/docs/source/ja/chat_templating.md b/docs/source/ja/chat_templating.md index 200bf40ac4cf40..82db942ef1e15b 100644 --- a/docs/source/ja/chat_templating.md +++ b/docs/source/ja/chat_templating.md @@ -85,7 +85,7 @@ LLM(Language Model)のますます一般的な使用事例の1つは「チ >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` diff --git a/docs/source/zh/chat_templating.md b/docs/source/zh/chat_templating.md index a08da47cb27a26..e0ab50b634c780 100644 --- a/docs/source/zh/chat_templating.md +++ b/docs/source/zh/chat_templating.md @@ -228,7 +228,7 @@ The sun. >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") ->>> tokenizer.default_chat_template +>>> tokenizer.chat_template "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" ``` diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index 67724538233430..1a8807214d52ba 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -405,17 +405,3 @@ def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ return token_ids_0 + [self.eos_token_id] - - @property - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. - """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py index 01cbf13809d657..0d24ed62c574a3 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -287,18 +287,3 @@ def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. """ return token_ids_0 + [self.eos_token_id] - - @property - # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. 
- """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py index 832b5315edfd7c..08c7be332e31ef 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -217,18 +217,3 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = index += 1 return vocab_file, merge_file - - @property - # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. - """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py index a80acdb650e445..21fb76cbfc8691 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -98,18 +98,3 @@ def create_token_type_ids_from_sequences( if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - @property - # Copied from transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer.default_chat_template - def default_chat_template(self): - """ - A very simple chat template that just adds whitespace between messages. - """ - return ( - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}" - "{{ message['content'] }}" - "{% if not loop.last %}{{ ' ' }}{% endif %}" - "{% endfor %}" - "{{ eos_token }}" - ) diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py index d0da1621d4c968..54e6377353084d 100644 --- a/src/transformers/models/bloom/tokenization_bloom_fast.py +++ b/src/transformers/models/bloom/tokenization_bloom_fast.py @@ -147,11 +147,3 @@ def _encode_plus(self, *args, **kwargs) -> BatchEncoding: def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) - - @property - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. 
- """ - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/code_llama/tokenization_code_llama.py b/src/transformers/models/code_llama/tokenization_code_llama.py index 5bbf2d0452f4ff..cc906687874ce0 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama.py +++ b/src/transformers/models/code_llama/tokenization_code_llama.py @@ -437,61 +437,6 @@ def create_token_type_ids_from_sequences( return output - @property - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. - Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! - - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. - """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template - def __getstate__(self): state = self.__dict__.copy() state["sp_model"] = None diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py 
b/src/transformers/models/code_llama/tokenization_code_llama_fast.py index 9bdb7a65b58499..b832348d07af4d 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py +++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py @@ -349,61 +349,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (out_vocab_file,) - @property - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. - Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! - - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. - """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template - def build_inputs_with_special_tokens( self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None ) -> List[int]: diff --git a/src/transformers/models/cohere/tokenization_cohere_fast.py b/src/transformers/models/cohere/tokenization_cohere_fast.py index b0a62e279ca8e9..bac665b473c57b 100644 --- 
a/src/transformers/models/cohere/tokenization_cohere_fast.py +++ b/src/transformers/models/cohere/tokenization_cohere_fast.py @@ -228,188 +228,6 @@ def add_bos_token(self, value): self._add_bos_token = value self.update_post_processor() - @property - def default_chat_template(self): - """ - Cohere Tokenizer uses <|START_OF_TURN_TOKEN|> and <|END_OF_TURN_TOKEN|> to indicate each turn in a chat. - Additioanlly, to indicate the source of the message, <|USER_TOKEN|>, <|CHATBOT_TOKEN|> and <|SYSTEM_TOKEN|> - for user, assitant and system messages respectively. - - The output should look something like: - <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ preamble }}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ How are you? }}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{ I am doing well! }}<|END_OF_TURN_TOKEN|> - - Use add_generation_prompt to add a prompt for the model to generate a response: - >>> from transformers import AutoTokenizer - >>> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01") - >>> messages = [{"role": "user", "content": "Hello, how are you?"}] - >>> tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello, how are you?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' - - """ - default_template = ( - "{{ bos_token }}" - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% if system_message != false %}" # Start with system message - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% set content = message['content'] %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% endfor %}" - "{% if add_generation_prompt %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}" - "{% endif %}" - ) - default_template = default_template.replace( - "USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false" - ) - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - default_template = default_template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - tool_use_template = ( - "{{ bos_token }}" - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% endif %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}" - "{{ '# Safety 
Preamble' }}" - "{{ '\nThe instructions in this section override those in the task description and style guide sections. Don\\'t answer questions that are harmful or immoral.' }}" - "{{ '\n\n# System Preamble' }}" - "{{ '\n## Basic Rules' }}" - "{{ '\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user\\'s requests, you cite your sources in your answers, according to those instructions.' }}" - "{{ '\n\n# User Preamble' }}" - "{{ '\n' + system_message }}" - "{{'\n\n## Available Tools\nHere is a list of tools that you have available to you:\n\n'}}" - "{% for tool in tools %}" - "{% if loop.index0 != 0 %}" - "{{ '\n\n'}}" - "{% endif %}" - "{{'```python\ndef ' + tool.name + '('}}" - "{% for param_name, param_fields in tool.parameter_definitions.items() %}" - "{% if loop.index0 != 0 %}" - "{{ ', '}}" - "{% endif %}" - "{{param_name}}: " - "{% if not param_fields.required %}" - "{{'Optional[' + param_fields.type + '] = None'}}" - "{% else %}" - "{{ param_fields.type }}" - "{% endif %}" - "{% endfor %}" - '{{ \') -> List[Dict]:\n """\'}}' - "{{ tool.description }}" - "{% if tool.parameter_definitions|length != 0 %}" - "{{ '\n\n Args:\n '}}" - "{% for param_name, param_fields in tool.parameter_definitions.items() %}" - "{% if loop.index0 != 0 %}" - "{{ '\n ' }}" - "{% endif %}" - "{{ param_name + ' ('}}" - "{% if not param_fields.required %}" - "{{'Optional[' + param_fields.type + ']'}}" - "{% else %}" - "{{ param_fields.type }}" - "{% endif %}" - "{{ '): ' + param_fields.description }}" - "{% endfor %}" - "{% endif %}" - '{{ \'\n """\n pass\n```\' }}' - "{% endfor %}" - "{{ '<|END_OF_TURN_TOKEN|>'}}" - "{% for message in loop_messages %}" - "{% set content = message['content'] %}" - "{% if message['role'] == 'user' %}" - "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% endfor %}" - "{{'<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write \\'Action:\\' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user\\'s last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. 
The list of actions you want to call should be formatted as a list of json objects, for example:\n```json\n[\n {\n \"tool_name\": title of the tool in the specification,\n \"parameters\": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters\n }\n]```<|END_OF_TURN_TOKEN|>'}}" - "{% if add_generation_prompt %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}" - "{% endif %}" - ) - default_tool_message = DEFAULT_RAG_PREAMBLE.replace("\n", "\\n").replace("'", "\\'") - tool_use_template = tool_use_template.replace("DEFAULT_SYSTEM_MESSAGE", default_tool_message) - - rag_template = ( - "{{ bos_token }}" - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% endif %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}" - "{{ '# Safety Preamble' }}" - "{{ '\nThe instructions in this section override those in the task description and style guide sections. Don\\'t answer questions that are harmful or immoral.' }}" - "{{ '\n\n# System Preamble' }}" - "{{ '\n## Basic Rules' }}" - "{{ '\nYou are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user\\'s requests, you cite your sources in your answers, according to those instructions.' }}" - "{{ '\n\n# User Preamble' }}" - "{{ '\n' + system_message }}" - "{{ '<|END_OF_TURN_TOKEN|>'}}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% set content = message['content'] %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}" - "{% endif %}" - "{% endfor %}" - "{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>'}}" - "{{ '' }}" - "{% for document in documents %}" # Loop over all non-system messages - "{{ '\nDocument: ' }}" - "{{ loop.index0 }}\n" - "{% for key, value in document.items() %}" - "{{ key }}: {{value}}\n" - "{% endfor %}" - "{% endfor %}" - "{{ ''}}" - "{{ '<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' }}" - "{{ 'Carefully perform the following instructions, in order, starting each with a new line.\n' }}" - "{{ 'Firstly, Decide which of the retrieved documents are relevant to the user\\'s last input by writing \\'Relevant Documents:\\' followed by comma-separated list of document numbers. If none are relevant, you should instead write \\'None\\'.\n' }}" - "{{ 'Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user\\'s last input by writing \\'Cited Documents:\\' followed a comma-separated list of document numbers. 
If you dont want to cite any of them, you should instead write \\'None\\'.\n' }}" - "{% if citation_mode=='accurate' %}" - "{{ 'Thirdly, Write \\'Answer:\\' followed by a response to the user\\'s last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup.\n' }}" - "{% endif %}" - "{{ 'Finally, Write \\'Grounded answer:\\' followed by a response to the user\\'s last input in high quality natural english. Use the symbols and to indicate when a fact comes from a document in the search result, e.g my fact for a fact from document 0.' }}" - "{{ '<|END_OF_TURN_TOKEN|>' }}" - "{% if add_generation_prompt %}" - "{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}" - "{% endif %}" - ) - default_rag_message = DEFAULT_RAG_PREAMBLE.replace("\n", "\\n").replace("'", "\\'") - rag_template = rag_template.replace("DEFAULT_SYSTEM_MESSAGE", default_rag_message) - - return {"default": default_template, "tool_use": tool_use_template, "rag": rag_template} - def apply_tool_use_template( self, conversation: Union[List[Dict[str, str]]], diff --git a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py index 51789e49b2d263..782f68bf921e04 100644 --- a/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py +++ b/src/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py @@ -236,19 +236,6 @@ def convert_tokens_to_string(self, tokens): text = "".join(words) return text - @property - def default_chat_template(self): - """ - A simple chat template that adds standard BOS, SEP and EOS tokens between messages while discarding role - information. - """ - return ( - "{% for message in messages %}" - "{% if not loop.first %}{{ bos_token}}{% endif %}" - "{{ sep_token }}{{ message.content }} {{ eos_token }}" - "{% endfor %}" - ) - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: index = 0 if os.path.isdir(save_directory): diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py index 9bca559d9ea009..badacf6dbe71ff 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2.py +++ b/src/transformers/models/gpt2/tokenization_gpt2.py @@ -329,10 +329,3 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): if is_split_into_words or add_prefix_space: text = " " + text return (text, kwargs) - - @property - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. 
- """ - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py index e6747119f4227f..90e83f0d35a351 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py +++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py @@ -139,12 +139,3 @@ def _encode_plus(self, *args, **kwargs) -> BatchEncoding: def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) - - @property - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. - """ - - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py index 2504fa3cc05154..c79e6d9ada15d3 100644 --- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py +++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py @@ -228,11 +228,3 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) - - @property - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.default_chat_template - def default_chat_template(self): - """ - A simple chat template that ignores role information and just concatenates messages with EOS tokens. - """ - return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}" diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py index f36f7e3fd6104d..ea7f3959c78db0 100644 --- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py @@ -161,18 +161,6 @@ def convert_tokens_to_string(self, tokens): out_string = "".join(tokens).strip() return out_string - @property - def default_chat_template(self): - """ - A simple chat template that just adds BOS/EOS tokens around messages while discarding role information. 
- """ - return ( - "{% for message in messages %}" - "{{ bos_token + eos_token + message.content + eos_token }}" - "{% endfor %}" - "{% if add_generation_prompt %} {{ bos_token + eos_token }} {% endif %}" - ) - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: index = 0 if os.path.isdir(save_directory): diff --git a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py index 1000bfd1b6c8b1..262aeaba5eea10 100644 --- a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py +++ b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py @@ -294,19 +294,3 @@ def decode_fast(self, token_ids: Union[int, List[int]]) -> str: """ return self.sp_model.decode(token_ids) - - @property - def default_chat_template(self): - """ - This chat template formats messages like an instant messenger chat log, with "User:" and "Bot:" strings - preceding messages. BOS tokens are added between all messages. - """ - return ( - "{{ eos_token }}{{ bos_token }}" - "{% for message in messages %}" - "{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}" - "{% else %}{{ 'Bot: ' + message['content']}}{% endif %}" - "{{ message['text'] }}{{ bos_token }}" - "{% endfor %}" - "Bot:" - ) diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index c665ba74d06aef..2e14118144baaa 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -251,60 +251,3 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - @property - def default_chat_template(self): - """ - This template formats inputs in the form of a chat history. For each message in the chat history: - * the template will output the role of the speaker followed by the content of the message. - * content can be a single string or a list of strings and images. - * If the content element is an image, the template will output a sequence of tokens and token before and after each image - * The template will output an token at the end of each message. - - Example: - - ```python - messages = [{ - "role": "user", - "content": [ - {"type": "text", "text": "What’s in this image?"}, - {"type": "image"}, - {"type": "image"}, - ], - }, - { - "role": "assistant", - "content": [{"type": "text", "text": "This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground."},] - }] - ``` - - Will create outputs like: - ``` - User: What is in this Image? - Assistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground. 
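The commit message above notes that the GPT-SW3 tokenization tests now set their template explicitly, since the "User:"/"Bot:" chat-log format is no longer attached to the class. A hedged sketch of what that assignment looks like; the template string is the one removed from `tokenization_gpt_sw3.py`, and the checkpoint name is illustrative.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("AI-Sweden-Models/gpt-sw3-126m")  # illustrative checkpoint

# The instant-messenger style template formerly returned by default_chat_template.
tokenizer.chat_template = (
    "{{ eos_token }}{{ bos_token }}"
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}"
    "{% else %}{{ 'Bot: ' + message['content']}}{% endif %}"
    "{{ message['text'] }}{{ bos_token }}"
    "{% endfor %}"
    "Bot:"
)

chat = [{"role": "user", "content": "Hej! Hur mår du?"}]
print(tokenizer.apply_chat_template(chat, tokenize=False))
```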
- ``` - """ - # fmt: off - return ( - "{% for message in messages %}" - "{{message['role'].capitalize()}}" - "{% if message['content'][0]['type'] == 'image' %}" - "{{':'}}" - "{% else %}" - "{{': '}}" - "{% endif %}" - "{% for line in message['content'] %}" - "{% if line['type'] == 'text' %}" - "{{line['text']}}" - "{% elif line['type'] == 'image' %}" - "{{ '' }}" - "{% endif %}" - "{% endfor %}" - "\n" - "{% endfor %}" - - "{% if add_generation_prompt %}" - "{{ 'Assistant:' }}" - "{% endif %}" - ) - # fmt: on diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index 80865ba98d6d67..385ad2d88e1053 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -411,57 +411,3 @@ def create_token_type_ids_from_sequences( output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) return output - - @property - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. - Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! - - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. 
- """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py index 91d3bf3615171f..67e339b4290a2b 100644 --- a/src/transformers/models/llama/tokenization_llama_fast.py +++ b/src/transformers/models/llama/tokenization_llama_fast.py @@ -241,61 +241,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (out_vocab_file,) - @property - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.default_chat_template - def default_chat_template(self): - """ - LLaMA uses [INST] and [/INST] to indicate user messages, and <> and <> to indicate system messages. - Assistant messages do not have special tokens, because LLaMA chat models are generally trained with strict - user/assistant/user/assistant message ordering, and so assistant messages can be identified from the ordering - rather than needing special tokens. The system message is partly 'embedded' in the first user message, which - results in an unusual token ordering when it is present. This template should definitely be changed if you wish - to fine-tune a model with more flexible role ordering! - - The output should look something like: - - [INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer [INST] Prompt [/INST] Answer - [INST] Prompt [/INST] - - The reference for this chat template is [this code - snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362) - in the original repository. 
- """ - template = ( - "{% if messages[0]['role'] == 'system' %}" - "{% set loop_messages = messages[1:] %}" # Extract system message if it's present - "{% set system_message = messages[0]['content'] %}" - "{% elif USE_DEFAULT_PROMPT == true and not '<>' in messages[0]['content'] %}" - "{% set loop_messages = messages %}" # Or use the default system message if the flag is set - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" - "{% else %}" - "{% set loop_messages = messages %}" - "{% set system_message = false %}" - "{% endif %}" - "{% for message in loop_messages %}" # Loop over all non-system messages - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" - "{% endif %}" - "{% if loop.index0 == 0 and system_message != false %}" # Embed system message in first message - "{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}" - "{% else %}" - "{% set content = message['content'] %}" - "{% endif %}" - "{% if message['role'] == 'user' %}" # After all of that, handle messages/roles in a fairly normal way - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" - "{% elif message['role'] == 'system' %}" - "{{ '<>\\n' + content.strip() + '\\n<>\\n\\n' }}" - "{% elif message['role'] == 'assistant' %}" - "{{ ' ' + content.strip() + ' ' + eos_token }}" - "{% endif %}" - "{% endfor %}" - ) - template = template.replace("USE_DEFAULT_PROMPT", "true" if self.use_default_system_prompt else "false") - default_message = DEFAULT_SYSTEM_PROMPT.replace("\n", "\\n").replace("'", "\\'") - template = template.replace("DEFAULT_SYSTEM_MESSAGE", default_message) - - return template - # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index 81426b3a0af3ac..6b5e86ab414958 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -159,63 +159,3 @@ def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names image_processor_input_names = self.image_processor.model_input_names return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - @property - def default_chat_template(self): - """ - This default vicuna template formats inputs in the form of a chat history. For each message in the chat history: - * the template will output the role of the speaker followed by the content of the message. - * content is a list of strings and images. - * If the content element is an image, the template will output a sequence of or