From f80018453347612d57013b7932063f6f1de932c3 Mon Sep 17 00:00:00 2001
From: ZanSara
Date: Mon, 6 Jun 2022 17:53:35 +0200
Subject: [PATCH 01/89] Simplification of language_model.py to remove code duplication

---
 haystack/modeling/model/language_model.py    | 1209 ++++-------------
 haystack/modeling/visual.py                  |    2 +-
 .../nodes/retriever/_embedding_encoder.py    |   61 +
 3 files changed, 364 insertions(+), 908 deletions(-)

diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py
index 1247a5dcf6..d1eda885ca 100644
--- a/haystack/modeling/model/language_model.py
+++ b/haystack/modeling/model/language_model.py
@@ -20,6 +20,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 from typing import Optional, Dict, Any, Union
+import re
 import json
 import logging
 import os
@@ -30,26 +31,8 @@
 from torch import nn
 import transformers
 from transformers import (
-    BertModel,
-    BertConfig,
-    RobertaModel,
-    RobertaConfig,
-    XLNetModel,
-    XLNetConfig,
-    AlbertModel,
-    AlbertConfig,
-    XLMRobertaModel,
-    XLMRobertaConfig,
-    DistilBertModel,
-    DistilBertConfig,
-    ElectraModel,
-    ElectraConfig,
-    CamembertModel,
-    CamembertConfig,
-    BigBirdModel,
-    BigBirdConfig,
-    DebertaV2Model,
-    DebertaV2Config,
+    PretrainedConfig,
+    PreTrainedModel,
 )
 from transformers import AutoModel, AutoConfig
 from transformers.modeling_utils import SequenceSummary
@@ -58,6 +41,115 @@
 logger = logging.getLogger(__name__)
 
+HF_PARAMETERS_BY_MODEL = {
+    "bert": {
+        "prefix": "Bert",
+    },
+    "xlm.*roberta": {
+        "prefix": "XLMRoberta",
+    },
+    "roberta.*xlm": {
+        "prefix": "XLMRoberta",
+    },
+    "bigbird": {
+        "prefix": "BigBird",
+    },
+    "roberta": {
+        "prefix": "Roberta",
+    },
+    "codebert.*mlm": {
+        "prefix": "Roberta",
+    },
+    "mlm.*codebert": {
+        "prefix": "Roberta",
+    },
+    "camembert": {
+        "prefix": "Camembert",
+    },
+    "umberto": {
+        "prefix": "Camembert",
+    },
+    "albert": {
+        "prefix": "Albert",
+    },
+    "distilbert": {
+        "prefix": "DistilBert",
+        "sequence_summary_config": {
+            "summary_last_dropout": 0,
+            "summary_type": "first",
+            "summary_activation": "tanh",
+        }
+    },
+    "xlnet": {
+        "prefix": "XLNet",
+        "sequence_summary_config": {
+            "summary_last_dropout": 0,
+        }
+    },
+    "electra": {
+        "prefix": "Electra",
+        "sequence_summary_config": {
+            "summary_last_dropout": 0,
+            "summary_type": "first",
+            "summary_activation": "gelu",
+            "summary_use_proj": False,
+        }
+    },
+    "word2vec": {
+        "prefix": "WordEmbedding_LM",
+    },
+    "glove": {
+        "prefix": "WordEmbedding_LM",
+    },
+    "minilm": {
+        "prefix": "Bert",
+    },
+    "deberta-v2": {
+        "prefix": "DebertaV2",
+        "sequence_summary_config": {
+            "summary_last_dropout": 0,
+            "summary_type": "first",
+            "summary_activation": "tanh",
+            "summary_use_proj": False
+        }
+    },
+}
+
+HF_MODEL_TYPES = {
+    "xlm-roberta": "XLMRoberta",
+    "roberta": "Roberta",
+    "camembert": "Camembert",
+    "albert": "Albert",
+    "distilbert": "DistilBert",
+    "bert": "Bert",
+    "xlnet": "XLNet",
+    "electra": "Electra",
+    "big_bird": "BigBird",
+    "deberta-v2": "DebertaV2",
+}
+
+HF_MODEL_STRINGS_HINTS = {
+    "xlm.*roberta|roberta.*xlm": "XLMRoberta",
+    "bigbird": "BigBird",
+    "roberta": "Roberta",
+    "codebert": "Roberta",
+    "camembert": "Camembert",
+    "albert": "Albert",
+    "distilbert": "DistilBert",
+    "bert": "Bert",
+    "xlnet": "XLNet",
+    "electra": "Electra",
+    "word2vec": "WordEmbedding_LM",
+    "glove": "WordEmbedding_LM",
+    "minilm": "Bert",
+    "dpr-question_encoder": "DPRQuestionEncoder",
+    "dpr-ctx_encoder": "DPRContextEncoder",
+}
+
+KNOWN_LANGUAGES = ("german", "english",
"chinese", "indian", "french", "polish", "spanish", "multilingual") +KNOWN_LANGUAGE_SPECIFIC_MODELS = (("camembert", "french"), ("umberto", "italian")) + + def silence_transformers_logs(from_pretrained_func): """ A wrapper that raises the log level of Transformers to @@ -208,86 +300,38 @@ def load( @staticmethod def get_language_model_class(model_name_or_path, use_auth_token: Union[str, bool] = None, **kwargs): + """ + Given a model name, try to use AutoConfig to understand which model type it is. + In case it's not successful, tries to infer the type from the name of the model. + """ # it's transformers format (either from model hub or local) model_name_or_path = str(model_name_or_path) - config = AutoConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token, **kwargs) - model_type = config.model_type - if model_type == "xlm-roberta": - language_model_class = "XLMRoberta" - elif model_type == "roberta": - if "mlm" in model_name_or_path.lower(): - raise NotImplementedError("MLM part of codebert is currently not supported in Haystack") - language_model_class = "Roberta" - elif model_type == "camembert": - language_model_class = "Camembert" - elif model_type == "albert": - language_model_class = "Albert" - elif model_type == "distilbert": - language_model_class = "DistilBert" - elif model_type == "bert": - language_model_class = "Bert" - elif model_type == "xlnet": - language_model_class = "XLNet" - elif model_type == "electra": - language_model_class = "Electra" - elif model_type == "dpr": - if config.architectures[0] == "DPRQuestionEncoder": - language_model_class = "DPRQuestionEncoder" - elif config.architectures[0] == "DPRContextEncoder": - language_model_class = "DPRContextEncoder" - elif config.archictectures[0] == "DPRReader": - raise NotImplementedError("DPRReader models are currently not supported.") - elif model_type == "big_bird": - language_model_class = "BigBird" - elif model_type == "deberta-v2": - language_model_class = "DebertaV2" - else: - # Fall back to inferring type from model name - logger.warning( - "Could not infer LanguageModel class from config. Trying to infer " - "LanguageModel class from model name." - ) - language_model_class = LanguageModel._infer_language_model_class_from_string(model_name_or_path) - - return language_model_class + language_model_class = HF_MODEL_TYPES.get(config.model_type, None) + + # Handle special cases + if not language_model_class: + + # DPR + if config.model_type == "dpr": + if config.architectures[0] == "DPRQuestionEncoder": + language_model_class = "DPRQuestionEncoder" + elif config.architectures[0] == "DPRContextEncoder": + language_model_class = "DPRContextEncoder" + elif config.archictectures[0] == "DPRReader": + raise NotImplementedError("DPRReader models are currently not supported.") + + # Infer from model name if still not found + else: + logger.warning("Could not infer the class from config. Trying to infer class from model name.") + for regex, class_ in HF_MODEL_STRINGS_HINTS.items(): + if re.match(regex, model_name_or_path): + language_model_class = class_ + break - @staticmethod - def _infer_language_model_class_from_string(model_name_or_path): - # If inferring Language model class from config doesn't succeed, - # fall back to inferring Language model class from model name. 
- if "xlm" in model_name_or_path.lower() and "roberta" in model_name_or_path.lower(): - language_model_class = "XLMRoberta" - elif "bigbird" in model_name_or_path.lower(): - language_model_class = "BigBird" - elif "roberta" in model_name_or_path.lower(): - language_model_class = "Roberta" - elif "codebert" in model_name_or_path.lower(): - if "mlm" in model_name_or_path.lower(): - raise NotImplementedError("MLM part of codebert is currently not supported in Haystack") - language_model_class = "Roberta" - elif "camembert" in model_name_or_path.lower() or "umberto" in model_name_or_path.lower(): - language_model_class = "Camembert" - elif "albert" in model_name_or_path.lower(): - language_model_class = "Albert" - elif "distilbert" in model_name_or_path.lower(): - language_model_class = "DistilBert" - elif "bert" in model_name_or_path.lower(): - language_model_class = "Bert" - elif "xlnet" in model_name_or_path.lower(): - language_model_class = "XLNet" - elif "electra" in model_name_or_path.lower(): - language_model_class = "Electra" - elif "word2vec" in model_name_or_path.lower() or "glove" in model_name_or_path.lower(): - language_model_class = "WordEmbedding_LM" - elif "minilm" in model_name_or_path.lower(): - language_model_class = "Bert" - elif "dpr-question_encoder" in model_name_or_path.lower(): - language_model_class = "DPRQuestionEncoder" - elif "dpr-ctx_encoder" in model_name_or_path.lower(): - language_model_class = "DPRContextEncoder" - else: - language_model_class = None + # Notes for some models + if language_model_class == "Roberta" and "mlm" in model_name_or_path.lower(): + raise NotImplementedError("MLM part of codebert is currently not supported in Haystack.") return language_model_class @@ -298,14 +342,6 @@ def get_output_dims(self): return getattr(config, odn) raise Exception("Could not infer the output dimensions of the language model") - def freeze(self, layers): - """To be implemented""" - raise NotImplementedError() - - def unfreeze(self): - """To be implemented""" - raise NotImplementedError() - def save_config(self, save_dir): save_filename = Path(save_dir) / "language_model_config.json" with open(save_filename, "w") as file: @@ -329,38 +365,24 @@ def save(self, save_dir: Union[str, Path], state_dict: Dict[Any, Any] = None): save_name = Path(save_dir) / "language_model.bin" model_to_save = ( self.model.module if hasattr(self.model, "module") else self.model - ) # Only save the model it-self + ) # Only save the model itself if not state_dict: state_dict = model_to_save.state_dict() torch.save(state_dict, save_name) self.save_config(save_dir) - @classmethod - def _get_or_infer_language_from_name(cls, language, name): - if language is not None: - return language - else: - return cls._infer_language_from_name(name) - - @classmethod - def _infer_language_from_name(cls, name): - known_languages = ("german", "english", "chinese", "indian", "french", "polish", "spanish", "multilingual") - matches = [lang for lang in known_languages if lang in name] - if "camembert" in name: - language = "french" - logger.info(f"Automatically detected language from language model name: {language}") - elif "umberto" in name: - language = "italian" - logger.info(f"Automatically detected language from language model name: {language}") - elif len(matches) == 0: - language = "english" - elif len(matches) > 1: - language = matches[0] + @staticmethod + def _infer_language_from_name(name: str) -> str: + language = "english" + languages = [lang for lang in KNOWN_LANGUAGES if lang in name] + if 
len(languages) == 0: + languages = [lang for model, lang in KNOWN_LANGUAGE_SPECIFIC_MODELS if model in name] + if len(languages) > 0: + language = languages[0] else: - language = matches[0] - logger.info(f"Automatically detected language from language model name: {language}") - + language = languages[0] + logger.info(f"Automatically detected language from model name: {language}") return language def formatted_preds(self, logits, samples, ignore_first_token=True, padding_mask=None, input_ids=None, **kwargs): @@ -439,26 +461,29 @@ def _pool_tokens(self, sequence_output, padding_mask, strategy, ignore_first_tok return pooled_vecs -class Bert(LanguageModel): +class HFLanguageModel(LanguageModel): """ - A BERT model that wraps Hugging Face's implementation + A model that wraps Hugging Face's implementation (https://github.com/huggingface/transformers) to fit the LanguageModel class. - Paper: https://arxiv.org/abs/1810.04805. """ - def __init__(self): - super(Bert, self).__init__() + def __init__(self, name: str): + super().__init__() self.model = None - self.name = "bert" + self.name = name - @classmethod - def from_scratch(cls, vocab_size, name="bert", language="en"): - bert = cls() - bert.name = name - bert.language = language - config = BertConfig(vocab_size=vocab_size) - bert.model = BertModel(config) - return bert + class_prefix = HF_PARAMETERS_BY_MODEL.get(self.name)["prefix"] + self.config_class: PretrainedConfig = getattr(transformers, class_prefix+"Config", None) + self.model_class: PreTrainedModel = getattr(transformers, class_prefix+"Model", None) + + # @classmethod + # def from_scratch(cls, vocab_size, name="bert", language="en"): + # bert = cls() + # bert.name = name + # bert.language = language + # config = BertConfig(vocab_size=vocab_size) + # bert.model = BertModel(config) + # return bert @classmethod @silence_transformers_logs @@ -472,24 +497,25 @@ def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = N :param pretrained_model_name_or_path: The path of the saved pretrained model or the name of the model. 
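# Illustrative sketch (not part of the diff above): how the dynamic lookup in
# HFLanguageModel.__init__ turns a prefix such as "Bert" (the kind of value stored under
# "prefix" in HF_PARAMETERS_BY_MODEL) into the concrete transformers classes.
import transformers

prefix = "Bert"
config_class = getattr(transformers, prefix + "Config")  # transformers.BertConfig
model_class = getattr(transformers, prefix + "Model")    # transformers.BertModel

config = config_class(vocab_size=30522)  # config built from scratch
model = model_class(config)              # randomly initialised BertModel
print(type(config).__name__, type(model).__name__)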
""" - bert = cls() + self = cls() if "haystack_lm_name" in kwargs: - bert.name = kwargs["haystack_lm_name"] + self.name = kwargs["haystack_lm_name"] else: - bert.name = pretrained_model_name_or_path + self.name = pretrained_model_name_or_path # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" if os.path.exists(haystack_lm_config): # Haystack style - bert_config = BertConfig.from_pretrained(haystack_lm_config) + model_config = self.config_class.from_pretrained(haystack_lm_config) haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - bert.model = BertModel.from_pretrained(haystack_lm_model, config=bert_config, **kwargs) - bert.language = bert.model.config.language + self.model = self.model_class.from_pretrained(haystack_lm_model, config=model_config, **kwargs) + self.language = self.model.config.language else: # Pytorch-transformer Style - bert.model = BertModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs) - bert.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path) - return bert + self.model = self.model_class.from_pretrained(str(pretrained_model_name_or_path), **kwargs) + self.language = language or cls._infer_language_from_name(pretrained_model_name_or_path) + return self + def forward( self, @@ -535,96 +561,94 @@ def disable_hidden_states_output(self): self.model.encoder.config.output_hidden_states = False -class Albert(LanguageModel): + +class HFLanguageModelWithPooler(HFLanguageModel): """ - An ALBERT model that wraps the Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. + A model that wraps Hugging Face's implementation + (https://github.com/huggingface/transformers) to fit the LanguageModel class, + with an extra pooler. + + NOTE: + - Unlike the other BERT variants, these don't output the `pooled_output`. An additional pooler is initialized. """ - def __init__(self): - super(Albert, self).__init__() - self.model = None - self.name = "albert" + def __init__(self, name: str): + super().__init__(name=name) + self.pooler = None @classmethod @silence_transformers_logs def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = None, **kwargs): """ - Load a language model by supplying one of the following: + Load a pretrained model by supplying one of the following: - * The name of a remote model on s3 (for example: "albert-base"). - * A local path of a model trained using transformers (for example: "some_dir/huggingface_model") - * A local path of a model trained using Haystack (for example: "some_dir/Haystack_model") + * The name of a remote model on s3 (for example, "distilbert-base-german-cased") + * A local path of a model trained using transformers (for example, "some_dir/huggingface_model") + * A local path of a model trained using Haystack (for example, "some_dir/haystack_model") - :param pretrained_model_name_or_path: Name or path of a model. - :param language: (Optional) The language the model was trained for (for example "german"). - If not supplied, Haystack tries to infer it from the model name. - :return: Language Model + :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. 
""" - albert = cls() - if "haystack_lm_name" in kwargs: - albert.name = kwargs["haystack_lm_name"] - else: - albert.name = pretrained_model_name_or_path - # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format - haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" - if os.path.exists(haystack_lm_config): - # Haystack style - config = AlbertConfig.from_pretrained(haystack_lm_config) - haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - albert.model = AlbertModel.from_pretrained(haystack_lm_model, config=config, **kwargs) - albert.language = albert.model.config.language - else: - # Huggingface transformer Style - albert.model = AlbertModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs) - albert.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path) - return albert + self = super().load(cls, pretrained_model_name_or_path, language, **kwargs) + config = self.model.config - def forward( + # These models do not provide a pooled_output by default. Therefore, we need to initialize an extra pooler. + # The pooler takes the first hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim). + # We don't want a dropout in the end of the pooler, since we do that already in the adaptive model before we + # feed everything to the prediction head + sequence_summary_config = HF_PARAMETERS_BY_MODEL.get(self.name)["sequence_summary_config"] + for key, value in sequence_summary_config.items(): + setattr(config, key, value) + + self.pooler = SequenceSummary(config) + self.pooler.apply(self.model._init_weights) + return self + + def forward( # type: ignore self, input_ids: torch.Tensor, - segment_ids: torch.Tensor, padding_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, **kwargs, ): """ - Perform the forward pass of the Albert model. + Perform the forward pass of the DistilBERT model. - :param input_ids: The IDs of each token in the input sequence. Is a tensor of shape [batch_size, max_seq_len]. - :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the - first sentence are marked with 0 and the tokens in the second sentence are marked with 1. - It is a tensor of shape [batch_size, max_seq_len]. + :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens of shape [batch_size, max_seq_len]. :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. :return: Embeddings for each token in the input sequence. 
""" - if output_hidden_states is None: - output_hidden_states = self.model.encoder.config.output_hidden_states - if output_attentions is None: - output_attentions = self.model.encoder.config.output_attentions + output_tuple = super().forward(input_ids=input_ids, padding_mask=padding_mask, output_hidden_states=output_hidden_states, output_attentions=output_attentions, **kwargs) + pooled_output = self.pooler(output_tuple[0]) + return (output_tuple[0], pooled_output) + output_tuple[1:] - output_tuple = self.model( - input_ids, - token_type_ids=segment_ids, - attention_mask=padding_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - return_dict=False, - ) - return output_tuple - def enable_hidden_states_output(self): - self.model.encoder.config.output_hidden_states = True - def disable_hidden_states_output(self): - self.model.encoder.config.output_hidden_states = False +class Bert(HFLanguageModel): + """ + A BERT model that wraps Hugging Face's implementation + (https://github.com/huggingface/transformers) to fit the LanguageModel class. + Paper: https://arxiv.org/abs/1810.04805. + """ + + def __init__(self): + super().__init__(name = "bert") + + +class Albert(HFLanguageModel): + """ + An ALBERT model that wraps the Hugging Face's implementation + (https://github.com/huggingface/transformers) to fit the LanguageModel class. + """ + + def __init__(self): + super().__init__(name = "albert") -class Roberta(LanguageModel): +class Roberta(HFLanguageModel): """ A roberta model that wraps the Hugging Face's implementation (https://github.com/huggingface/transformers) to fit the LanguageModel class. @@ -632,533 +656,100 @@ class Roberta(LanguageModel): """ def __init__(self): - super(Roberta, self).__init__() - self.model = None - self.name = "roberta" - - @classmethod - @silence_transformers_logs - def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = None, **kwargs): - """ - Load a language model by supplying one of the following: + super().__init__(name = "roberta") - * The name of a remote model on s3 (for example: "roberta-base"). - * A local path of a model trained using transformers (for example: "some_dir/huggingface_model"). - * A local path of a model trained using Haystack (for example: "some_dir/haystack_model"). - :param pretrained_model_name_or_path: Name or path of a model. - :param language: (Optional) The language the model was trained for (for example: "german"). - If not supplied, Haystack tries to infer it from the model name. 
- :return: Language Model - """ - roberta = cls() - if "haystack_lm_name" in kwargs: - roberta.name = kwargs["haystack_lm_name"] - else: - roberta.name = pretrained_model_name_or_path - # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format - haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" - if os.path.exists(haystack_lm_config): - # Haystack style - config = RobertaConfig.from_pretrained(haystack_lm_config) - haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - roberta.model = RobertaModel.from_pretrained(haystack_lm_model, config=config, **kwargs) - roberta.language = roberta.model.config.language - else: - # Huggingface transformer Style - roberta.model = RobertaModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs) - roberta.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path) - return roberta +class XLMRoberta(HFLanguageModel): + """ + A roberta model that wraps the Hugging Face's implementation + (https://github.com/huggingface/transformers) to fit the LanguageModel class. + Paper: https://arxiv.org/abs/1907.11692 + """ - def forward( - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, - padding_mask: torch.Tensor, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - **kwargs, - ): - """ - Perform the forward pass of the Roberta model. + def __init__(self): + super().__init__(name = "xlm_roberta") - :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. - :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the - first sentence are marked with 0 and the tokens in the second sentence are marked with 1. - It is a tensor of shape [batch_size, max_seq_len]. - :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens - of shape [batch_size, max_seq_len]. - :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. - :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. - :return: Embeddings for each token in the input sequence. - """ - if output_hidden_states is None: - output_hidden_states = self.model.encoder.config.output_hidden_states - if output_attentions is None: - output_attentions = self.model.encoder.config.output_attentions - output_tuple = self.model( - input_ids, - token_type_ids=segment_ids, - attention_mask=padding_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - return_dict=False, - ) - return output_tuple +class DistilBert(HFLanguageModelWithPooler): + """ + A DistilBERT model that wraps Hugging Face's implementation + (https://github.com/huggingface/transformers) to fit the LanguageModel class. - def enable_hidden_states_output(self): - self.model.encoder.config.output_hidden_states = True + NOTE: + - DistilBert doesn't have `token_type_ids`, you don't need to indicate which + token belongs to which segment. Just separate your segments with the separation + token `tokenizer.sep_token` (or [SEP]). + - Unlike the other BERT variants, DistilBert does not output the + `pooled_output`. An additional pooler is initialized. 
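# Illustrative sketch (not part of the diff above): the DistilBert NOTE in practice. A DistilBERT
# tokenizer returns no token_type_ids, so two segments are only delimited by the [SEP] token.
# Running this downloads the (real) distilbert-base-uncased checkpoint from the model hub.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
encoded = tokenizer("How does Haystack load models?", "It wraps transformers.", return_tensors="pt")
print(sorted(encoded.keys()))  # ['attention_mask', 'input_ids'] -- no token_type_ids
print(tokenizer.sep_token)     # [SEP]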
+ """ - def disable_hidden_states_output(self): - self.model.encoder.config.output_hidden_states = False + def __init__(self): + super().__init__(name = "distilbert") -class XLMRoberta(LanguageModel): +class XLNet(HFLanguageModelWithPooler): """ - A roberta model that wraps the Hugging Face's implementation + A XLNet model that wraps the Hugging Face's implementation (https://github.com/huggingface/transformers) to fit the LanguageModel class. - Paper: https://arxiv.org/abs/1907.11692 + Paper: https://arxiv.org/abs/1906.08237 """ def __init__(self): - super(XLMRoberta, self).__init__() - self.model = None - self.name = "xlm_roberta" + super().__init__(name = "xlnet") - @classmethod - @silence_transformers_logs - def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = None, **kwargs): - """ - Load a language model by supplying one fo the following: - * The name of a remote model on s3 (for example: "xlm-roberta-base") - * A local path of a model trained using transformers (for example: "some_dir/huggingface_model"). - * A local path of a model trained using Haystack (for example: "some_dir/haystack_model"). +class Electra(HFLanguageModelWithPooler): + """ + ELECTRA is a new pre-training approach which trains two transformer models: + the generator and the discriminator. The generator replaces tokens in a sequence, + and is therefore trained as a masked language model. The discriminator, which is + the model we're interested in, tries to identify which tokens were replaced by + the generator in the sequence. - :param pretrained_model_name_or_path: Name or path of a model. - :param language: (Optional) The language the model was trained for (for example, "german"). - If not supplied, Haystack tries to infer it from the model name. - :return: Language Model - """ - xlm_roberta = cls() - if "haystack_lm_name" in kwargs: - xlm_roberta.name = kwargs["haystack_lm_name"] - else: - xlm_roberta.name = pretrained_model_name_or_path - # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format - haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" - if os.path.exists(haystack_lm_config): - # Haystack style - config = XLMRobertaConfig.from_pretrained(haystack_lm_config) - haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - xlm_roberta.model = XLMRobertaModel.from_pretrained(haystack_lm_model, config=config, **kwargs) - xlm_roberta.language = xlm_roberta.model.config.language - else: - # Huggingface transformer Style - xlm_roberta.model = XLMRobertaModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs) - xlm_roberta.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path) - return xlm_roberta + The ELECTRA model here wraps Hugging Face's implementation + (https://github.com/huggingface/transformers) to fit the LanguageModel class. - def forward( - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, - padding_mask: torch.Tensor, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - **kwargs, - ): - """ - Perform the forward pass of the XLMRoberta model. + NOTE: + - Electra does not output the `pooled_output`. An additional pooler is initialized. + """ - :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. - :param segment_ids: The ID of the segment. 
For example, in next sentence prediction, the tokens in the - first sentence are marked with 0 and the tokens in the second sentence are marked with 1. - It is a tensor of shape [batch_size, max_seq_len]. - :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens - of shape [batch_size, max_seq_len]. - :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. - :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. - :return: Embeddings for each token in the input sequence. - """ - if output_hidden_states is None: - output_hidden_states = self.model.encoder.config.output_hidden_states - if output_attentions is None: - output_attentions = self.model.encoder.config.output_attentions + def __init__(self): + super().__init__(name = "electra") - output_tuple = self.model( - input_ids, - token_type_ids=segment_ids, - attention_mask=padding_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - return_dict=False, - ) - return output_tuple - def enable_hidden_states_output(self): - self.model.encoder.config.output_hidden_states = True +class Camembert(HFLanguageModel): + """ + A Camembert model that wraps the Hugging Face's implementation + (https://github.com/huggingface/transformers) to fit the LanguageModel class. + """ - def disable_hidden_states_output(self): - self.model.encoder.config.output_hidden_states = False + def __init__(self): + super().__init__(name = "camembert") -class DistilBert(LanguageModel): +class BigBird(HFLanguageModel): """ - A DistilBERT model that wraps Hugging Face's implementation + A BERT model that wraps Hugging Face's implementation (https://github.com/huggingface/transformers) to fit the LanguageModel class. + Paper: https://arxiv.org/abs/1810.04805 + """ + + def __init__(self): + super().__init__("big_bird") + + +class DebertaV2(HFLanguageModelWithPooler): + """ + This is a wrapper around the DebertaV2 model from Hugging Face's transformers library. + It is also compatible with DebertaV3 as DebertaV3 only changes the pretraining procedure. NOTE: - - DistilBert doesn’t have `token_type_ids`, you don’t need to indicate which - token belongs to which segment. Just separate your segments with the separation - token `tokenizer.sep_token` (or [SEP]). - - Unlike the other BERT variants, DistilBert does not output the - `pooled_output`. An additional pooler is initialized. + - DebertaV2 does not output the `pooled_output`. An additional pooler is initialized. """ def __init__(self): - super(DistilBert, self).__init__() - self.model = None - self.name = "distilbert" - self.pooler = None + super().__init__(name = "deberta-v2") - @classmethod - @silence_transformers_logs - def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = None, **kwargs): - """ - Load a pretrained model by supplying one of the following: - - * The name of a remote model on s3 (for example, "distilbert-base-german-cased") - * A local path of a model trained using transformers (for example, "some_dir/huggingface_model") - * A local path of a model trained using Haystack (for example, "some_dir/haystack_model") - - :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. 
- """ - distilbert = cls() - if "haystack_lm_name" in kwargs: - distilbert.name = kwargs["haystack_lm_name"] - else: - distilbert.name = pretrained_model_name_or_path - # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format - haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" - if os.path.exists(haystack_lm_config): - # Haystack style - config = DistilBertConfig.from_pretrained(haystack_lm_config) - haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - distilbert.model = DistilBertModel.from_pretrained(haystack_lm_model, config=config, **kwargs) - distilbert.language = distilbert.model.config.language - else: - # Pytorch-transformer Style - distilbert.model = DistilBertModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs) - distilbert.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path) - config = distilbert.model.config - - # DistilBERT does not provide a pooled_output by default. Therefore, we need to initialize an extra pooler. - # The pooler takes the first hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim). - # We don't want a dropout in the end of the pooler, since we do that already in the adaptive model before we - # feed everything to the prediction head - config.summary_last_dropout = 0 - config.summary_type = "first" - config.summary_activation = "tanh" - distilbert.pooler = SequenceSummary(config) - distilbert.pooler.apply(distilbert.model._init_weights) - return distilbert - - def forward( # type: ignore - self, - input_ids: torch.Tensor, - padding_mask: torch.Tensor, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - **kwargs, - ): - """ - Perform the forward pass of the DistilBERT model. - - :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. - :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens - of shape [batch_size, max_seq_len]. - :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. - :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. - :return: Embeddings for each token in the input sequence. - """ - if output_hidden_states is None: - output_hidden_states = self.model.encoder.config.output_hidden_states - if output_attentions is None: - output_attentions = self.model.encoder.config.output_attentions - - output_tuple = self.model( - input_ids, - attention_mask=padding_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - return_dict=False, - ) - # We need to manually aggregate that to get a pooled output (one vec per seq) - pooled_output = self.pooler(output_tuple[0]) - return (output_tuple[0], pooled_output) + output_tuple[1:] - - def enable_hidden_states_output(self): - self.model.config.output_hidden_states = True - - def disable_hidden_states_output(self): - self.model.config.output_hidden_states = False - - -class XLNet(LanguageModel): - """ - A XLNet model that wraps the Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. 
- Paper: https://arxiv.org/abs/1906.08237 - """ - - def __init__(self): - super(XLNet, self).__init__() - self.model = None - self.name = "xlnet" - self.pooler = None - - @classmethod - @silence_transformers_logs - def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = None, **kwargs): - """ - Load a language model by supplying one of the following: - - * The name of a remote model on s3 (for example, "xlnet-base-cased"). - * A local path of a model trained using transformers (for example, "some_dir/huggingface_model"). - * Alocal path of a model trained using Haystack (for example, "some_dir/haystack_model"). - - :param pretrained_model_name_or_path: Name or path of a model. - :param language: (Optional) The language the model was trained for (for example, "german"). - If not supplied, Haystack tries to infer it from the model name. - :return: Language Model - """ - xlnet = cls() - if "haystack_lm_name" in kwargs: - xlnet.name = kwargs["haystack_lm_name"] - else: - xlnet.name = pretrained_model_name_or_path - # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format - haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" - if os.path.exists(haystack_lm_config): - # Haystack style - config = XLNetConfig.from_pretrained(haystack_lm_config) - haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - xlnet.model = XLNetModel.from_pretrained(haystack_lm_model, config=config, **kwargs) - xlnet.language = xlnet.model.config.language - else: - # Pytorch-transformer Style - xlnet.model = XLNetModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs) - xlnet.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path) - config = xlnet.model.config - # XLNet does not provide a pooled_output by default. Therefore, we need to initialize an extra pooler. - # The pooler takes the last hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim). - # We don't want a dropout in the end of the pooler, since we do that already in the adaptive model before we - # feed everything to the prediction head - config.summary_last_dropout = 0 - xlnet.pooler = SequenceSummary(config) - xlnet.pooler.apply(xlnet.model._init_weights) - return xlnet - - def forward( - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, - padding_mask: torch.Tensor, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - **kwargs, - ): - """ - Perform the forward pass of the XLNet model. - - :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. - :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the - first sentence are marked with 0 and the tokens in the second sentence are marked with 1. - It is a tensor of shape [batch_size, max_seq_len]. - :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens - of shape [batch_size, max_seq_len]. - :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. - :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. - :return: Embeddings for each token in the input sequence. 
- """ - if output_hidden_states is None: - output_hidden_states = self.model.encoder.config.output_hidden_states - if output_attentions is None: - output_attentions = self.model.encoder.config.output_attentions - - # Note: XLNet has a couple of special input tensors for pretraining / text generation (perm_mask, target_mapping ...) - # We will need to implement them, if we wanna support LM adaptation - output_tuple = self.model( - input_ids, - attention_mask=padding_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - return_dict=False, - ) - # XLNet also only returns the sequence_output (one vec per token) - # We need to manually aggregate that to get a pooled output (one vec per seq) - # TODO verify that this is really doing correct pooling - pooled_output = self.pooler(output_tuple[0]) - return (output_tuple[0], pooled_output) + output_tuple[1:] - - def enable_hidden_states_output(self): - self.model.output_hidden_states = True - - def disable_hidden_states_output(self): - self.model.output_hidden_states = False - - -class Electra(LanguageModel): - """ - ELECTRA is a new pre-training approach which trains two transformer models: - the generator and the discriminator. The generator replaces tokens in a sequence, - and is therefore trained as a masked language model. The discriminator, which is - the model we're interested in, tries to identify which tokens were replaced by - the generator in the sequence. - - The ELECTRA model here wraps Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - - NOTE: - - Electra does not output the `pooled_output`. An additional pooler is initialized. - """ - - def __init__(self): - super(Electra, self).__init__() - self.model = None - self.name = "electra" - self.pooler = None - - @classmethod - @silence_transformers_logs - def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = None, **kwargs): - """ - Load a pretrained model by supplying one of the following - - * The name of a remote model on s3 (for example, "google/electra-base-discriminator"). - * A local path of a model trained using transformers ("some_dir/huggingface_model"). - * A local path of a model trained using Haystack ("some_dir/haystack_model"). - - :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. - """ - electra = cls() - if "haystack_lm_name" in kwargs: - electra.name = kwargs["haystack_lm_name"] - else: - electra.name = pretrained_model_name_or_path - # We need to differentiate between loading model using Haystack format and Transformers format - haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" - if os.path.exists(haystack_lm_config): - # Haystack style - config = ElectraConfig.from_pretrained(haystack_lm_config) - haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - electra.model = ElectraModel.from_pretrained(haystack_lm_model, config=config, **kwargs) - electra.language = electra.model.config.language - else: - # Transformers Style - electra.model = ElectraModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs) - electra.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path) - config = electra.model.config - - # ELECTRA does not provide a pooled_output by default. Therefore, we need to initialize an extra pooler. 
- # The pooler takes the first hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim). - # We don't want a dropout in the end of the pooler, since we do that already in the adaptive model before we - # feed everything to the prediction head. - # Note: ELECTRA uses gelu as activation (BERT uses tanh instead) - config.summary_last_dropout = 0 - config.summary_type = "first" - config.summary_activation = "gelu" - config.summary_use_proj = False - electra.pooler = SequenceSummary(config) - electra.pooler.apply(electra.model._init_weights) - return electra - - def forward( - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, - padding_mask: torch.Tensor, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - **kwargs, - ): - """ - Perform the forward pass of the ELECTRA model. - - :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. - :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens - of shape [batch_size, max_seq_len]. - :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. - :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. - :return: Embeddings for each token in the input sequence. - """ - output_tuple = self.model(input_ids, token_type_ids=segment_ids, attention_mask=padding_mask, return_dict=False) - - if output_hidden_states is None: - output_hidden_states = self.model.encoder.config.output_hidden_states - if output_attentions is None: - output_attentions = self.model.encoder.config.output_attentions - - output_tuple = self.model( - input_ids, - attention_mask=padding_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - ) - # We need to manually aggregate that to get a pooled output (one vec per seq) - pooled_output = self.pooler(output_tuple[0]) - return (output_tuple[0], pooled_output) + output_tuple[1:] - - def disable_hidden_states_output(self): - self.model.config.output_hidden_states = False - - -class Camembert(Roberta): - """ - A Camembert model that wraps the Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - """ - - def __init__(self): - super(Camembert, self).__init__() - self.model = None - self.name = "camembert" - - @classmethod - @silence_transformers_logs - def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = None, **kwargs): - """ - Load a language model by supplying one of the following: - - * The name of a remote model on s3 (for example, "camembert-base"). - * A local path of a model trained using transformers (for example, "some_dir/huggingface_model"). - * A local path of a model trained using Haystack (for example, "some_dir/haystack_model"). - - :param pretrained_model_name_or_path: Name or path of a model. - :param language: (Optional) The language the model was trained for (for example, "german"). - If not supplied, Haystack tries to infer it from the model name. 
- :return: Language Model - """ - camembert = cls() - if "haystack_lm_name" in kwargs: - camembert.name = kwargs["haystack_lm_name"] - else: - camembert.name = pretrained_model_name_or_path - # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format - haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" - if os.path.exists(haystack_lm_config): - # Haystack style - config = CamembertConfig.from_pretrained(haystack_lm_config) - haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - camembert.model = CamembertModel.from_pretrained(haystack_lm_model, config=config, **kwargs) - camembert.language = camembert.model.config.language - else: - # Huggingface transformer Style - camembert.model = CamembertModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs) - camembert.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path) - return camembert class DPRQuestionEncoder(LanguageModel): @@ -1167,7 +758,7 @@ class DPRQuestionEncoder(LanguageModel): """ def __init__(self): - super(DPRQuestionEncoder, self).__init__() + super().__init__() self.model = None self.name = "dpr_question_encoder" @@ -1249,9 +840,7 @@ def load( dpr_question_encoder.model.base_model.bert_model = AutoModel.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **original_config_dict ) - dpr_question_encoder.language = cls._get_or_infer_language_from_name( - language, pretrained_model_name_or_path - ) + dpr_question_encoder.language = language or cls._infer_language_from_name(pretrained_model_name_or_path) return dpr_question_encoder @@ -1277,7 +866,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] new_key = key.split("_encoder.bert_model.", 1)[1] state_dict[new_key] = state_dict.pop(key) - super(DPRQuestionEncoder, self).save(save_dir=save_dir, state_dict=state_dict) + super().save(save_dir=save_dir, state_dict=state_dict) def forward( # type: ignore self, @@ -1323,7 +912,7 @@ class DPRContextEncoder(LanguageModel): """ def __init__(self): - super(DPRContextEncoder, self).__init__() + super().__init__() self.model = None self.name = "dpr_context_encoder" @@ -1409,7 +998,7 @@ def load( dpr_context_encoder.model.base_model.bert_model = AutoModel.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **original_config_dict ) - dpr_context_encoder.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path) + dpr_context_encoder.language = language or cls._infer_language_from_name(pretrained_model_name_or_path) return dpr_context_encoder @@ -1436,7 +1025,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] new_key = key.split("_encoder.bert_model.", 1)[1] state_dict[new_key] = state_dict.pop(key) - super(DPRContextEncoder, self).save(save_dir=save_dir, state_dict=state_dict) + super().save(save_dir=save_dir, state_dict=state_dict) def forward( # type: ignore self, @@ -1478,197 +1067,3 @@ def enable_hidden_states_output(self): def disable_hidden_states_output(self): self.model.ctx_encoder.config.output_hidden_states = False - - -class BigBird(LanguageModel): - """ - A BERT model that wraps Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. 
- Paper: https://arxiv.org/abs/1810.04805 - """ - - def __init__(self): - super(BigBird, self).__init__() - self.model = None - self.name = "big_bird" - - @classmethod - def from_scratch(cls, vocab_size, name="big_bird", language="en"): - big_bird = cls() - big_bird.name = name - big_bird.language = language - config = BigBirdConfig(vocab_size=vocab_size) - big_bird.model = BigBirdModel(config) - return big_bird - - @classmethod - @silence_transformers_logs - def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = None, **kwargs): - """ - Load a pretrained model by supplying one of the following: - - * The name of a remote model on s3 (for example, "bert-base-cased"). - * A local path of a model trained using transformers (for example, "some_dir/huggingface_model"). - * A local path of a model trained using Haystack (for example, "some_dir/haystack_model"). - - :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. - """ - big_bird = cls() - if "haystack_lm_name" in kwargs: - big_bird.name = kwargs["haystack_lm_name"] - else: - big_bird.name = pretrained_model_name_or_path - # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format - haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" - if os.path.exists(haystack_lm_config): - # Haystack style - big_bird_config = BigBirdConfig.from_pretrained(haystack_lm_config) - haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - big_bird.model = BigBirdModel.from_pretrained(haystack_lm_model, config=big_bird_config, **kwargs) - big_bird.language = big_bird.model.config.language - else: - # Pytorch-transformer Style - big_bird.model = BigBirdModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs) - big_bird.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path) - return big_bird - - def forward( - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, - padding_mask: torch.Tensor, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - **kwargs, - ): - """ - Perform the forward pass of the BigBird model. - - :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. - :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the - first sentence are marked with 0 and the tokens in the second sentence are marked with 1. - It is a tensor of shape [batch_size, max_seq_len]. - :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens - of shape [batch_size, max_seq_len]. - :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. - :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. - :return: Embeddings for each token in the input sequence. 
- """ - if output_hidden_states is None: - output_hidden_states = self.model.encoder.config.output_hidden_states - if output_attentions is None: - output_attentions = self.model.encoder.config.output_attentions - - output_tuple = self.model( - input_ids, - token_type_ids=segment_ids, - attention_mask=padding_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - return_dict=False, - ) - return output_tuple - - def enable_hidden_states_output(self): - self.model.encoder.config.output_hidden_states = True - - def disable_hidden_states_output(self): - self.model.encoder.config.output_hidden_states = False - - -class DebertaV2(LanguageModel): - """ - This is a wrapper around the DebertaV2 model from Hugging Face's transformers library. - It is also compatible with DebertaV3 as DebertaV3 only changes the pretraining procedure. - - NOTE: - - DebertaV2 does not output the `pooled_output`. An additional pooler is initialized. - """ - - def __init__(self): - super().__init__() - self.model = None - self.name = "deberta-v2" - self.pooler = None - - @classmethod - @silence_transformers_logs - def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = None, **kwargs): - """ - Load a pretrained model by supplying one of the following: - - * A remote name from the Hugging Face's model hub (for example: microsoft/deberta-v3-base). - * A local path of a model trained using transformers (for example: some_dir/huggingface_model). - * A local path of a model trained using Haystack (for example: some_dir/haystack_model). - - :param pretrained_model_name_or_path: The path to the saved pretrained model or the name of the model. - """ - debertav2 = cls() - if "haystack_lm_name" in kwargs: - debertav2.name = kwargs["haystack_lm_name"] - else: - debertav2.name = pretrained_model_name_or_path - # We need to differentiate between loading model using Haystack format and Transformers format - haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" - if os.path.exists(haystack_lm_config): - # Haystack style - config = DebertaV2Config.from_pretrained(haystack_lm_config) - haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - debertav2.model = DebertaV2Model.from_pretrained(haystack_lm_model, config=config, **kwargs) - debertav2.language = debertav2.model.config.language - else: - # Transformers Style - debertav2.model = DebertaV2Model.from_pretrained(str(pretrained_model_name_or_path), **kwargs) - debertav2.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path) - config = debertav2.model.config - - # DebertaV2 does not provide a pooled_output by default. Therefore, we need to initialize an extra pooler. - # The pooler takes the first hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim). - # We don't want a dropout in the end of the pooler, since we do that already in the adaptive model before we - # feed everything to the prediction head. 
- config.summary_last_dropout = 0 - config.summary_type = "first" - config.summary_activation = "tanh" - config.summary_use_proj = False - debertav2.pooler = SequenceSummary(config) - debertav2.pooler.apply(debertav2.model._init_weights) - return debertav2 - - def forward( - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, - padding_mask: torch.Tensor, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - **kwargs, - ): - """ - Perform the forward pass of the DebertaV2 model. - - :param input_ids: The IDs of each token in the input sequence. Is a tensor of shape [batch_size, max_seq_len]. - :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens - of shape [batch_size, max_seq_len]. - :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. - :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. - :return: Embeddings for each token in the input sequence. - """ - output_tuple = self.model(input_ids, token_type_ids=segment_ids, attention_mask=padding_mask, return_dict=False) - - if output_hidden_states is None: - output_hidden_states = self.model.encoder.config.output_hidden_states - if output_attentions is None: - output_attentions = self.model.encoder.config.output_attentions - - output_tuple = self.model( - input_ids, - attention_mask=padding_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, - ) - # We need to manually aggregate that to get a pooled output (one vec per seq) - pooled_output = self.pooler(output_tuple[0]) - return (output_tuple[0], pooled_output) + output_tuple[1:] - - def disable_hidden_states_output(self): - self.model.config.output_hidden_states = False diff --git a/haystack/modeling/visual.py b/haystack/modeling/visual.py index e45f4d7786..d2084bdc5e 100644 --- a/haystack/modeling/visual.py +++ b/haystack/modeling/visual.py @@ -91,7 +91,7 @@ """ WORKER_M = r""" 0 -/|\ +/|\ /'\ """ WORKER_F = r""" 0 diff --git a/haystack/nodes/retriever/_embedding_encoder.py b/haystack/nodes/retriever/_embedding_encoder.py index c434959bcd..496e455739 100644 --- a/haystack/nodes/retriever/_embedding_encoder.py +++ b/haystack/nodes/retriever/_embedding_encoder.py @@ -306,9 +306,70 @@ def save(self, save_dir: Union[Path, str]): raise NotImplementedError("save method can only be used with sentence-transformers EmbeddingRetriever(s)") +class _Data2VecVisionEmbeddingEncoder(_BaseEmbeddingEncoder): + def __init__(self, retriever: "EmbeddingRetriever"): + + self.embedding_model = Inferencer.load( + retriever.embedding_model, + revision=retriever.model_version, + task_type="embeddings", + extraction_strategy=retriever.pooling_strategy, + extraction_layer=retriever.emb_extraction_layer, + gpu=retriever.use_gpu, + batch_size=retriever.batch_size, + max_seq_len=retriever.max_seq_len, + num_processes=0, + use_auth_token=retriever.use_auth_token, + ) + # Check that document_store has the right similarity function + similarity = retriever.document_store.similarity + # If we are using a sentence transformer model + if "sentence" in retriever.embedding_model.lower() and similarity != "cosine": + logger.warning( + f"You seem to be using a Sentence Transformer with the {similarity} function. " + f"We recommend using cosine instead. 
" + f"This can be set when initializing the DocumentStore" + ) + elif "dpr" in retriever.embedding_model.lower() and similarity != "dot_product": + logger.warning( + f"You seem to be using a DPR model with the {similarity} function. " + f"We recommend using dot_product instead. " + f"This can be set when initializing the DocumentStore" + ) + + def embed(self, texts: Union[List[List[str]], List[str], str]) -> List[np.ndarray]: + # TODO: FARM's `sample_to_features_text` need to fix following warning - + # tokenization_utils.py:460: FutureWarning: `is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead. + emb = self.embedding_model.inference_from_dicts(dicts=[{"text": t} for t in texts]) + emb = [(r["vec"]) for r in emb] + return emb + + def embed_queries(self, texts: List[str]) -> List[np.ndarray]: + return self.embed(texts) + + def embed_documents(self, docs: List[Document]) -> List[np.ndarray]: + passages = [d.content for d in docs] # type: ignore + return self.embed(passages) + + def train( + self, + training_data: List[Dict[str, Any]], + learning_rate: float = 2e-5, + n_epochs: int = 1, + num_warmup_steps: int = None, + batch_size: int = 16, + ): + raise NotImplementedError("train method can only be used with sentence-transformers EmbeddingRetriever(s)") + + def save(self, save_dir: Union[Path, str]): + raise NotImplementedError("save method can only be used with sentence-transformers EmbeddingRetriever(s)") + + + _EMBEDDING_ENCODERS: Dict[str, Callable] = { "farm": _DefaultEmbeddingEncoder, "transformers": _DefaultEmbeddingEncoder, "sentence_transformers": _SentenceTransformersEmbeddingEncoder, "retribert": _RetribertEmbeddingEncoder, + "data2vec_vision": _Data2VecVisionEmbeddingEncoder, } From 91caa7f80c5ca55ac23b7af15e9b658a523399ce Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 10 Jun 2022 10:37:44 +0200 Subject: [PATCH 02/89] restructure language_model.py --- haystack/modeling/data_handler/data_silo.py | 7 +- haystack/modeling/infer.py | 8 +- haystack/modeling/model/adaptive_model.py | 24 +- haystack/modeling/model/language_model.py | 484 ++++++++++---------- haystack/modeling/training/base.py | 40 +- test/modeling/test_modeling_dpr.py | 6 - 6 files changed, 306 insertions(+), 263 deletions(-) diff --git a/haystack/modeling/data_handler/data_silo.py b/haystack/modeling/data_handler/data_silo.py index 2ab4753930..062b1d85c3 100644 --- a/haystack/modeling/data_handler/data_silo.py +++ b/haystack/modeling/data_handler/data_silo.py @@ -811,7 +811,12 @@ def _run_teacher(self, batch: dict) -> List[torch.Tensor]: """ Run the teacher model on the given batch. """ - return self.teacher.inferencer.model(**batch) + params = {'input_ids': batch["input_ids"], 'segment_ids': batch["segment_ids"], 'padding_mask': batch["padding_mask"]} + if 'output_hidden_states' in batch.keys(): + params['output_hidden_states'] = batch["output_hidden_states"] + if 'output_attentions' in batch.keys(): + params['output_attentions'] = batch["output_attentions"] + return self.teacher.inferencer.model(**params) def _pass_batches( self, diff --git a/haystack/modeling/infer.py b/haystack/modeling/infer.py index 85b828b22b..8245c81bef 100644 --- a/haystack/modeling/infer.py +++ b/haystack/modeling/infer.py @@ -511,7 +511,13 @@ def _get_predictions_and_aggregate(self, dataset: Dataset, tensor_names: List, b with torch.no_grad(): # Aggregation works on preds, not logits. 
We want as much processing happening in one batch + on GPU # So we transform logits to preds here as well - logits = self.model.forward(**batch) + logits = self.model.forward( + input_ids=batch["input_ids"], + segment_ids=batch["segment_ids"], + padding_mask=batch["padding_mask"], + output_hidden_states=batch.get("output_hidden_states", False), + output_attentions=batch.get("output_attentions", False) + ) # preds = self.model.logits_to_preds(logits, **batch)[0] (This must somehow be useful for SQuAD) preds = self.model.logits_to_preds(logits, **batch) unaggregated_preds_all.append(preds) diff --git a/haystack/modeling/model/adaptive_model.py b/haystack/modeling/model/adaptive_model.py index ac126e485b..b4bcf22854 100644 --- a/haystack/modeling/model/adaptive_model.py +++ b/haystack/modeling/model/adaptive_model.py @@ -334,7 +334,7 @@ def convert_from_transformers( :return: AdaptiveModel """ - lm = LanguageModel.load(model_name_or_path, revision=revision, use_auth_token=use_auth_token, **kwargs) + lm = LanguageModel.load(model_name_or_path, revision=revision, auth_token=use_auth_token, **kwargs) if task_type is None: # Infer task type from config architecture = lm.model.config.architectures[0] @@ -462,20 +462,34 @@ def prepare_labels(self, **kwargs): all_labels.append(labels) return all_labels - def forward(self, output_hidden_states: bool = False, output_attentions: bool = False, **kwargs): + def forward(self, + input_ids: torch.Tensor, + segment_ids: torch.Tensor, + padding_mask: torch.Tensor, + output_hidden_states: bool = False, + output_attentions: bool = False + ): """ Push data through the whole model and returns logits. The data will propagate through the language model and each of the attached prediction heads. - :param kwargs: Holds all arguments that need to be passed to the language model - and prediction head(s). + :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. + :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the + first sentence are marked with 0 and the tokens in the second sentence are marked with 1. + It is a tensor of shape [batch_size, max_seq_len]. + :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens + of shape [batch_size, max_seq_len]. :param output_hidden_states: Whether to output hidden states :param output_attentions: Whether to output attentions :return: All logits as torch.tensor or multiple tensors. 
""" # Run forward pass of language model output_tuple = self.language_model.forward( - **kwargs, output_hidden_states=output_hidden_states, output_attentions=output_attentions + input_ids=input_ids, + segment_ids=segment_ids, + padding_mask=padding_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions ) if output_hidden_states: if output_attentions: diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index d1eda885ca..71d3f5538f 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -24,16 +24,15 @@ import json import logging import os +from abc import ABC, abstractmethod from pathlib import Path from functools import wraps +from black import out import numpy as np import torch from torch import nn import transformers -from transformers import ( - PretrainedConfig, - PreTrainedModel, -) +from transformers import PretrainedConfig, PreTrainedModel from transformers import AutoModel, AutoConfig from transformers.modeling_utils import SequenceSummary @@ -42,50 +41,21 @@ HF_PARAMETERS_BY_MODEL = { - "bert": { - "prefix": "Bert", - }, - "xlm.*roberta": { - "prefix": "XLMRoberta", - }, - "roberta.*xml": { - "prefix": "XLMRoberta", - }, - "bigbird": { - "prefix": "BigBird", - }, - "roberta": { - "prefix": "Roberta", - }, - "codebert.*mlm": { - "prefix": "Roberta", - }, - "mlm.*codebert": { - "prefix": "Roberta", - }, - "camembert": { - "prefix": "Camembert", - }, - "umberto": { - "prefix": "Camembert", - }, - "albert": { - "prefix": "Albert", - }, + "bert": {"prefix": "Bert"}, + "xlm.*roberta": {"prefix": "XLMRoberta"}, + "roberta.*xml": {"prefix": "XLMRoberta"}, + "bigbird": {"prefix": "BigBird"}, + "roberta": {"prefix": "Roberta"}, + "codebert.*mlm": {"prefix": "Roberta"}, + "mlm.*codebert": {"prefix": "Roberta"}, + "camembert": {"prefix": "Camembert"}, + "umberto": {"prefix": "Camembert"}, + "albert": {"prefix": "Albert"}, "distilbert": { "prefix": "DistilBert", - "sequence_summary_config": { - "summary_last_dropout": 0, - "summary_type": "first", - "summary_activation": "tanh", - } - }, - "xlnet": { - "prefix": "XLNet", - "sequence_summary_config": { - "summary_last_dropout": 0, - } + "sequence_summary_config": {"summary_last_dropout": 0, "summary_type": "first", "summary_activation": "tanh"}, }, + "xlnet": {"prefix": "XLNet", "sequence_summary_config": {"summary_last_dropout": 0}}, "electra": { "prefix": "Electra", "sequence_summary_config": { @@ -93,39 +63,33 @@ "summary_type": "first", "summary_activation": "gelu", "summary_use_proj": False, - } - }, - "word2vec": { - "prefix": "WordEmbedding_LM", - }, - "glove": { - "prefix": "WordEmbedding_LM", - }, - "minilm": { - "prefix": "Bert", + }, }, + "word2vec": {"prefix": "WordEmbedding_LM"}, + "glove": {"prefix": "WordEmbedding_LM"}, + "minilm": {"prefix": "Bert"}, "deberta-v2": { "prefix": "DebertaV2", "sequence_summary_config": { "summary_last_dropout": 0, "summary_type": "first", "summary_activati": "tanh", - "summary_use_proj": False - } + "summary_use_proj": False, + }, }, } HF_MODEL_TYPES = { - "xlm-roberta": "XLMRoberta", - "roberta": "Roberta", - "camembert": "Camembert", + "bert": "Bert", "albert": "Albert", + "roberta": "Roberta", + "xlm-roberta": "XLMRoberta", "distilbert": "DistilBert", - "bert": "Bert", "xlnet": "XLNet", "electra": "Electra", + "camembert": "Camembert", "big_bird": "BigBird", - "deberta-v2": "DebertaV2", + "deberta-v2": "DebertaV2", } HF_MODEL_STRINGS_HINTS = { @@ -179,7 +143,7 @@ def 
quiet_from_pretrained_func(cls, *args, **kwargs): OUTPUT_DIM_NAMES = ["dim", "hidden_size", "d_model"] # TODO analyse if LMs can be completely used through HF transformers -class LanguageModel(nn.Module): +class LanguageModel(nn.Module, ABC): """ The parent class for any kind of model that can embed language into a semantic vector space. Practically speaking, these models read in tokenized sentences and return vectors that capture the meaning of sentences @@ -196,16 +160,22 @@ def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) cls.subclasses[cls.__name__] = cls - def forward(self, input_ids: torch.Tensor, segment_ids: torch.Tensor, padding_mask: torch.Tensor, **kwargs): + def __init__(self): + super().__init__() + + @abstractmethod + def forward(self, input_ids: torch.Tensor, segment_ids: torch.Tensor, padding_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None): raise NotImplementedError - @classmethod + @staticmethod def load( - cls, - pretrained_model_name_or_path: Union[Path, str], - language: str = None, - use_auth_token: Union[bool, str] = None, - **kwargs, + pretrained_model_name_or_path: Union[Path, str], + language: str = None, + n_added_tokens: int = 0, + language_model_class: Optional[str] = None, + auth_token: Optional[str] = None, + revision: Optional[str] = None, + **kwargs ): """ Load a pretrained language model by doing one of the following: @@ -246,72 +216,63 @@ def load( :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. :param revision: The version of the model to use from the Hugging Face model hub. This can be a tag name, a branch name, or a commit hash. - :param language_model_class: (Optional) Name of the language model class to load (for example `Bert`). + :param language_model_class: (Optional) Name of the language model class to load (for example `Bert`). Unused if the model is local. """ - n_added_tokens = kwargs.pop("n_added_tokens", 0) - language_model_class = kwargs.pop("language_model_class", None) - kwargs["revision"] = kwargs.get("revision", None) logger.info("LOADING MODEL") logger.info("=============") + config_file = Path(pretrained_model_name_or_path) / "language_model_config.json" + if os.path.exists(config_file): - logger.info(f"Model found locally at {pretrained_model_name_or_path}") # it's a local directory in Haystack format + logger.info(f"Model found locally at {pretrained_model_name_or_path}") config = json.load(open(config_file)) - language_model = cls.subclasses[config["name"]].load(pretrained_model_name_or_path) + language_model_class = config["name"] else: + # It's from the model hub logger.info(f"Could not find {pretrained_model_name_or_path} locally.") logger.info(f"Looking on Transformers Model Hub (in local cache and online)...") if language_model_class is None: - language_model_class = cls.get_language_model_class( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs + language_model_class = LanguageModel.get_language_model_class( + pretrained_model_name_or_path, auth_token=auth_token, **kwargs ) - - if language_model_class: - language_model = cls.subclasses[language_model_class].load( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs + if not language_model_class: + raise Exception( + f"Model not found for {pretrained_model_name_or_path}. Either supply the local path for a saved " + f"model or one of bert/roberta/xlnet/albert/distilbert models that can be downloaded from remote. 
" + f"Ensure that the model class name can be inferred from the directory name when loading a " + f"Transformers' model." ) - else: - language_model = None - - if not language_model: - raise Exception( - f"Model not found for {pretrained_model_name_or_path}. Either supply the local path for a saved " - f"model or one of bert/roberta/xlnet/albert/distilbert models that can be downloaded from remote. " - f"Ensure that the model class name can be inferred from the directory name when loading a " - f"Transformers' model." - ) + language_model = LanguageModel.subclasses[language_model_class]( + pretrained_model_name_or_path, + auth_token=auth_token, + n_added_tokens=n_added_tokens, + language=language, + revision=revision, + **kwargs + ) logger.info(f"Loaded {pretrained_model_name_or_path}") - - # resize embeddings in case of custom vocab - if n_added_tokens != 0: - # TODO verify for other models than BERT - model_emb_size = language_model.model.resize_token_embeddings(new_num_tokens=None).num_embeddings - vocab_size = model_emb_size + n_added_tokens - logger.info( - f"Resizing embedding layer of LM from {model_emb_size} to {vocab_size} to cope with custom vocab." - ) - language_model.model.resize_token_embeddings(vocab_size) - # verify - model_emb_size = language_model.model.resize_token_embeddings(new_num_tokens=None).num_embeddings - assert vocab_size == model_emb_size - return language_model @staticmethod - def get_language_model_class(model_name_or_path, use_auth_token: Union[str, bool] = None, **kwargs): + def get_language_model_class(model_name_or_path, auth_token: Optional[str] = None, revision: Optional[str] = None, **kwargs): """ Given a model name, try to use AutoConfig to understand which model type it is. In case it's not successful, tries to infer the type from the name of the model. """ # it's transformers format (either from model hub or local) model_name_or_path = str(model_name_or_path) - config = AutoConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token, **kwargs) + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path=model_name_or_path, + use_auth_token=auth_token or False, + revision=revision, + **kwargs + ) language_model_class = HF_MODEL_TYPES.get(config.model_type, None) # Handle special cases if not language_model_class: - + # DPR if config.model_type == "dpr": if config.architectures[0] == "DPRQuestionEncoder": @@ -320,7 +281,7 @@ def get_language_model_class(model_name_or_path, use_auth_token: Union[str, bool language_model_class = "DPRContextEncoder" elif config.archictectures[0] == "DPRReader": raise NotImplementedError("DPRReader models are currently not supported.") - + # Infer from model name if still not found else: logger.warning("Could not infer the class from config. Trying to infer class from model name.") @@ -363,9 +324,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Dict[Any, Any] = None): """ # Save Weights save_name = Path(save_dir) / "language_model.bin" - model_to_save = ( - self.model.module if hasattr(self.model, "module") else self.model - ) # Only save the model itself + model_to_save = self.model.module if hasattr(self.model, "module") else self.model # Only save the model itself if not state_dict: state_dict = model_to_save.state_dict() @@ -467,27 +426,8 @@ class HFLanguageModel(LanguageModel): (https://github.com/huggingface/transformers) to fit the LanguageModel class. 
""" - def __init__(self, name: str): - super().__init__() - self.model = None - self.name = name - - class_prefix = HF_PARAMETERS_BY_MODEL.get(self.name)["prefix"] - self.config_class: PretrainedConfig = getattr(transformers, class_prefix+"Config", None) - self.model_class: PreTrainedModel = getattr(transformers, class_prefix+"Model", None) - - # @classmethod - # def from_scratch(cls, vocab_size, name="bert", language="en"): - # bert = cls() - # bert.name = name - # bert.language = language - # config = BertConfig(vocab_size=vocab_size) - # bert.model = BertModel(config) - # return bert - - @classmethod @silence_transformers_logs - def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = None, **kwargs): + def __init__(self, pretrained_model_name_or_path: Union[Path, str], model_type: str, language: str = None, n_added_tokens: int = 0, auth_token: Optional[str] = None, **kwargs): """ Load a pretrained model by supplying one of the following: @@ -497,25 +437,47 @@ def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = N :param pretrained_model_name_or_path: The path of the saved pretrained model or the name of the model. """ - self = cls() - if "haystack_lm_name" in kwargs: - self.name = kwargs["haystack_lm_name"] - else: - self.name = pretrained_model_name_or_path + super().__init__() + self.name = kwargs["haystack_lm_name"] if "haystack_lm_name" in kwargs else pretrained_model_name_or_path + + class_prefix = HF_PARAMETERS_BY_MODEL.get(model_type)["prefix"] + config_class: PretrainedConfig = getattr(transformers, class_prefix + "Config", None) + model_class: PreTrainedModel = getattr(transformers, class_prefix + "Model", None) + # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" if os.path.exists(haystack_lm_config): # Haystack style - model_config = self.config_class.from_pretrained(haystack_lm_config) haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - self.model = self.model_class.from_pretrained(haystack_lm_model, config=model_config, **kwargs) + model_config = config_class.from_pretrained(haystack_lm_config) + self.model = model_class.from_pretrained(haystack_lm_model, config=model_config, use_auth_token=auth_token or False, **kwargs) self.language = self.model.config.language else: # Pytorch-transformer Style - self.model = self.model_class.from_pretrained(str(pretrained_model_name_or_path), **kwargs) - self.language = language or cls._infer_language_from_name(pretrained_model_name_or_path) - return self + self.model = model_class.from_pretrained(str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **kwargs) + self.language = language or self._infer_language_from_name(pretrained_model_name_or_path) + + # resize embeddings in case of custom vocab + if n_added_tokens != 0: + # TODO verify for other models than BERT + model_emb_size = self.model.resize_token_embeddings(new_num_tokens=None).num_embeddings + vocab_size = model_emb_size + n_added_tokens + logger.info( + f"Resizing embedding layer of LM from {model_emb_size} to {vocab_size} to cope with custom vocab." 
+ ) + self.model.resize_token_embeddings(vocab_size) + # verify + model_emb_size = self.model.resize_token_embeddings(new_num_tokens=None).num_embeddings + assert vocab_size == model_emb_size + # @classmethod + # def from_scratch(cls, vocab_size, name="bert", language="en"): + # bert = cls() + # bert.name = name + # bert.language = language + # config = BertConfig(vocab_size=vocab_size) + # bert.model = BertModel(config) + # return bert def forward( self, @@ -523,11 +485,10 @@ def forward( segment_ids: torch.Tensor, padding_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - **kwargs, + output_attentions: Optional[bool] = None ): """ - Perform the forward pass of the BERT model. + Perform the forward pass of the model. :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the @@ -561,7 +522,6 @@ def disable_hidden_states_output(self): self.model.encoder.config.output_hidden_states = False - class HFLanguageModelWithPooler(HFLanguageModel): """ A model that wraps Hugging Face's implementation @@ -572,13 +532,7 @@ class HFLanguageModelWithPooler(HFLanguageModel): - Unlike the other BERT variants, these don't output the `pooled_output`. An additional pooler is initialized. """ - def __init__(self, name: str): - super().__init__(name=name) - self.pooler = None - - @classmethod - @silence_transformers_logs - def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = None, **kwargs): + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): """ Load a pretrained model by supplying one of the following: @@ -588,7 +542,8 @@ def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = N :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. """ - self = super().load(cls, pretrained_model_name_or_path, language, **kwargs) + super().__init__(pretrained_model_name_or_path, language, n_added_tokens, **kwargs) + self.pooler = None config = self.model.config # These models do not provide a pooled_output by default. Therefore, we need to initialize an extra pooler. @@ -601,18 +556,18 @@ def load(cls, pretrained_model_name_or_path: Union[Path, str], language: str = N self.pooler = SequenceSummary(config) self.pooler.apply(self.model._init_weights) - return self def forward( # type: ignore self, input_ids: torch.Tensor, + segment_ids: torch.Tensor, padding_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, **kwargs, ): """ - Perform the forward pass of the DistilBERT model. + Perform the forward pass of the model. :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens @@ -621,12 +576,18 @@ def forward( # type: ignore :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. :return: Embeddings for each token in the input sequence. 
""" - output_tuple = super().forward(input_ids=input_ids, padding_mask=padding_mask, output_hidden_states=output_hidden_states, output_attentions=output_attentions, **kwargs) + output_tuple = super().forward( + input_ids=input_ids, + segment_ids=segment_ids, + padding_mask=padding_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + **kwargs, + ) pooled_output = self.pooler(output_tuple[0]) return (output_tuple[0], pooled_output) + output_tuple[1:] - class Bert(HFLanguageModel): """ A BERT model that wraps Hugging Face's implementation @@ -634,8 +595,14 @@ class Bert(HFLanguageModel): Paper: https://arxiv.org/abs/1810.04805. """ - def __init__(self): - super().__init__(name = "bert") + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + language=language, + n_added_tokens=n_added_tokens, + model_type="bert", + **kwargs + ) class Albert(HFLanguageModel): @@ -644,8 +611,14 @@ class Albert(HFLanguageModel): (https://github.com/huggingface/transformers) to fit the LanguageModel class. """ - def __init__(self): - super().__init__(name = "albert") + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + language=language, + n_added_tokens=n_added_tokens, + model_type="albert", + **kwargs + ) class Roberta(HFLanguageModel): @@ -655,8 +628,14 @@ class Roberta(HFLanguageModel): Paper: https://arxiv.org/abs/1907.11692 """ - def __init__(self): - super().__init__(name = "roberta") + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + language=language, + n_added_tokens=n_added_tokens, + model_type="roberta", + **kwargs + ) class XLMRoberta(HFLanguageModel): @@ -666,9 +645,14 @@ class XLMRoberta(HFLanguageModel): Paper: https://arxiv.org/abs/1907.11692 """ - def __init__(self): - super().__init__(name = "xlm_roberta") - + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + language=language, + n_added_tokens=n_added_tokens, + model_type="xlm-roberta", + **kwargs + ) class DistilBert(HFLanguageModelWithPooler): """ @@ -683,9 +667,14 @@ class DistilBert(HFLanguageModelWithPooler): `pooled_output`. An additional pooler is initialized. 
""" - def __init__(self): - super().__init__(name = "distilbert") - + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + language=language, + n_added_tokens=n_added_tokens, + model_type="distilbert", + **kwargs + ) class XLNet(HFLanguageModelWithPooler): """ @@ -694,9 +683,14 @@ class XLNet(HFLanguageModelWithPooler): Paper: https://arxiv.org/abs/1906.08237 """ - def __init__(self): - super().__init__(name = "xlnet") - + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + language=language, + n_added_tokens=n_added_tokens, + model_type="xlnet", + **kwargs + ) class Electra(HFLanguageModelWithPooler): """ @@ -713,9 +707,14 @@ class Electra(HFLanguageModelWithPooler): - Electra does not output the `pooled_output`. An additional pooler is initialized. """ - def __init__(self): - super().__init__(name = "electra") - + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + language=language, + n_added_tokens=n_added_tokens, + model_type="electra", + **kwargs + ) class Camembert(HFLanguageModel): """ @@ -723,9 +722,14 @@ class Camembert(HFLanguageModel): (https://github.com/huggingface/transformers) to fit the LanguageModel class. """ - def __init__(self): - super().__init__(name = "camembert") - + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + language=language, + n_added_tokens=n_added_tokens, + model_type="camembert", + **kwargs + ) class BigBird(HFLanguageModel): """ @@ -734,9 +738,14 @@ class BigBird(HFLanguageModel): Paper: https://arxiv.org/abs/1810.04805 """ - def __init__(self): - super().__init__("big_bird") - + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + language=language, + n_added_tokens=n_added_tokens, + model_type="bigbird", + **kwargs + ) class DebertaV2(HFLanguageModelWithPooler): """ @@ -747,28 +756,27 @@ class DebertaV2(HFLanguageModelWithPooler): - DebertaV2 does not output the `pooled_output`. An additional pooler is initialized. """ - def __init__(self): - super().__init__(name = "deberta-v2") - - + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + language=language, + n_added_tokens=n_added_tokens, + model_type="deberta-v2", + **kwargs + ) class DPRQuestionEncoder(LanguageModel): """ A DPRQuestionEncoder model that wraps Hugging Face's implementation. 
""" - def __init__(self): - super().__init__() - self.model = None - self.name = "dpr_question_encoder" - - @classmethod @silence_transformers_logs - def load( - cls, + def __init__( + self, pretrained_model_name_or_path: Union[Path, str], language: str = None, - use_auth_token: Union[str, bool] = None, + n_added_tokens: int = 0, + auth_token: Optional[str] = None, **kwargs, ): """ @@ -780,11 +788,11 @@ def load( :param pretrained_model_name_or_path: The path of the base pretrained language model whose weights are used to initialize DPRQuestionEncoder. """ - dpr_question_encoder = cls() + super().__init__() if "haystack_lm_name" in kwargs: - dpr_question_encoder.name = kwargs["haystack_lm_name"] + self.name = kwargs["haystack_lm_name"] else: - dpr_question_encoder.name = pretrained_model_name_or_path + self.name = pretrained_model_name_or_path # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -795,7 +803,7 @@ def load( if original_model_config.model_type == "dpr": dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) - dpr_question_encoder.model = transformers.DPRQuestionEncoder.from_pretrained( + self.model = transformers.DPRQuestionEncoder.from_pretrained( haystack_lm_model, config=dpr_config, **kwargs ) else: @@ -806,22 +814,22 @@ def load( ) original_config_dict = vars(original_model_config) original_config_dict.update(kwargs) - dpr_question_encoder.model = transformers.DPRQuestionEncoder( + self.model = transformers.DPRQuestionEncoder( config=transformers.DPRConfig(**original_config_dict) ) - language_model_class = cls.get_language_model_class(haystack_lm_config, use_auth_token, **kwargs) - dpr_question_encoder.model.base_model.bert_model = ( - cls.subclasses[language_model_class].load(str(pretrained_model_name_or_path)).model + language_model_class = DPRQuestionEncoder.get_language_model_class(haystack_lm_config, auth_token or False, **kwargs) + self.model.base_model.bert_model = ( + DPRQuestionEncoder.subclasses[language_model_class](str(pretrained_model_name_or_path)).model ) - dpr_question_encoder.language = dpr_question_encoder.model.config.language + self.language = self.model.config.language else: original_model_config = AutoConfig.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token + pretrained_model_name_or_path, use_auth_token=auth_token or False ) if original_model_config.model_type == "dpr": # "pretrained dpr model": load existing pretrained DPRQuestionEncoder model - dpr_question_encoder.model = transformers.DPRQuestionEncoder.from_pretrained( - str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **kwargs + self.model = transformers.DPRQuestionEncoder.from_pretrained( + str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **kwargs ) else: # "from scratch": load weights from different architecture (e.g. 
bert) into DPRQuestionEncoder @@ -834,15 +842,14 @@ def load( ) original_config_dict = vars(original_model_config) original_config_dict.update(kwargs) - dpr_question_encoder.model = transformers.DPRQuestionEncoder( + self.model = transformers.DPRQuestionEncoder( config=transformers.DPRConfig(**original_config_dict) ) - dpr_question_encoder.model.base_model.bert_model = AutoModel.from_pretrained( - str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **original_config_dict + self.model.base_model.bert_model = AutoModel.from_pretrained( + str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **original_config_dict ) - dpr_question_encoder.language = language or cls._infer_language_from_name(pretrained_model_name_or_path) + self.language = language or DPRQuestionEncoder._infer_language_from_name(pretrained_model_name_or_path) - return dpr_question_encoder def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] = None): """ @@ -910,19 +917,13 @@ class DPRContextEncoder(LanguageModel): """ A DPRContextEncoder model that wraps Hugging Face's implementation. """ - - def __init__(self): - super().__init__() - self.model = None - self.name = "dpr_context_encoder" - - @classmethod @silence_transformers_logs - def load( - cls, + def __init__( + self, pretrained_model_name_or_path: Union[Path, str], language: str = None, - use_auth_token: Union[str, bool] = None, + n_added_tokens: int = 0, + auth_token: Optional[str] = None, **kwargs, ): """ @@ -934,11 +935,11 @@ def load( :param pretrained_model_name_or_path: The path of the base pretrained language model whose weights are used to initialize DPRContextEncoder. """ - dpr_context_encoder = cls() + super().__init__() if "haystack_lm_name" in kwargs: - dpr_context_encoder.name = kwargs["haystack_lm_name"] + self.name = kwargs["haystack_lm_name"] else: - dpr_context_encoder.name = pretrained_model_name_or_path + self.name = pretrained_model_name_or_path # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -949,8 +950,8 @@ def load( if original_model_config.model_type == "dpr": dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) - dpr_context_encoder.model = transformers.DPRContextEncoder.from_pretrained( - haystack_lm_model, config=dpr_config, use_auth_token=use_auth_token, **kwargs + self.model = transformers.DPRContextEncoder.from_pretrained( + haystack_lm_model, config=dpr_config, use_auth_token=auth_token or False, **kwargs ) else: if original_model_config.model_type != "bert": @@ -960,26 +961,24 @@ def load( ) original_config_dict = vars(original_model_config) original_config_dict.update(kwargs) - dpr_context_encoder.model = transformers.DPRContextEncoder( + self.model = transformers.DPRContextEncoder( config=transformers.DPRConfig(**original_config_dict) ) - language_model_class = cls.get_language_model_class(haystack_lm_config, **kwargs) - dpr_context_encoder.model.base_model.bert_model = ( - cls.subclasses[language_model_class] - .load(str(pretrained_model_name_or_path), use_auth_token=use_auth_token) - .model + language_model_class = DPRQuestionEncoder.get_language_model_class(haystack_lm_config, **kwargs) + self.model.base_model.bert_model = ( + DPRContextEncoder.subclasses[language_model_class](str(pretrained_model_name_or_path), auth_token=auth_token).model ) - dpr_context_encoder.language = 
dpr_context_encoder.model.config.language + self.language = self.model.config.language else: # Pytorch-transformer Style original_model_config = AutoConfig.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token + pretrained_model_name_or_path, use_auth_token=auth_token or False ) if original_model_config.model_type == "dpr": # "pretrained dpr model": load existing pretrained DPRContextEncoder model - dpr_context_encoder.model = transformers.DPRContextEncoder.from_pretrained( - str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **kwargs + self.model = transformers.DPRContextEncoder.from_pretrained( + str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **kwargs ) else: # "from scratch": load weights from different architecture (e.g. bert) into DPRContextEncoder @@ -992,15 +991,14 @@ def load( ) original_config_dict = vars(original_model_config) original_config_dict.update(kwargs) - dpr_context_encoder.model = transformers.DPRContextEncoder( + self.model = transformers.DPRContextEncoder( config=transformers.DPRConfig(**original_config_dict) ) - dpr_context_encoder.model.base_model.bert_model = AutoModel.from_pretrained( - str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **original_config_dict + self.model.base_model.bert_model = AutoModel.from_pretrained( + str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **original_config_dict ) - dpr_context_encoder.language = language or cls._infer_language_from_name(pretrained_model_name_or_path) + self.language = language or DPRContextEncoder._infer_language_from_name(pretrained_model_name_or_path) - return dpr_context_encoder def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] = None): """ diff --git a/haystack/modeling/training/base.py b/haystack/modeling/training/base.py index 67a126c2fd..5031a20d50 100644 --- a/haystack/modeling/training/base.py +++ b/haystack/modeling/training/base.py @@ -767,7 +767,14 @@ def compute_loss(self, batch: dict, step: int) -> torch.Tensor: keys = list(batch.keys()) keys = [key for key in keys if key.startswith("teacher_output")] teacher_logits = [batch.pop(key) for key in keys] - logits = self.model.forward(**batch) + + params = {'input_ids': batch["input_ids"], 'segment_ids': batch["segment_ids"], 'padding_mask': batch["padding_mask"]} + if 'output_hidden_states' in batch.keys(): + params['output_hidden_states'] = batch["output_hidden_states"] + if 'output_attentions' in batch.keys(): + params['output_attentions'] = batch["output_attentions"] + logits = self.model.forward(**params) + student_loss = self.model.logits_to_loss(logits=logits, global_step=self.global_step, **batch) distillation_loss = self.distillation_loss_fn( student_logits=logits[0] / self.temperature, teacher_logits=teacher_logits[0] / self.temperature @@ -899,7 +906,12 @@ def __init__( self.loss = DataParallel(self.loss).to(device) def compute_loss(self, batch: dict, step: int) -> torch.Tensor: - return self.backward_propagate(torch.sum(self.loss(batch)), step) + params = {'input_ids': batch["input_ids"], 'segment_ids': batch["segment_ids"], 'padding_mask': batch["padding_mask"]} + if 'output_hidden_states' in batch.keys(): + params['output_hidden_states'] = batch["output_hidden_states"] + if 'output_attentions' in batch.keys(): + params['output_attentions'] = batch["output_attentions"] + return self.backward_propagate(torch.sum(self.loss(**params)), step) class DistillationLoss(Module): @@ -945,14 +957,28 @@ def 
__init__(self, model: Union[DataParallel, AdaptiveModel], teacher_model: Mod else: self.dim_mappings.append(None) - def forward(self, batch): + def forward( + self, + input_ids: torch.Tensor, + segment_ids: torch.Tensor, + padding_mask: torch.Tensor + ): with torch.no_grad(): _, teacher_hidden_states, teacher_attentions = self.teacher_model.forward( - **batch, output_attentions=True, output_hidden_states=True + input_ids=input_ids, + segment_ids=segment_ids, + padding_mask=padding_mask, + output_attentions=True, + output_hidden_states=True ) - - _, hidden_states, attentions = self.model.forward(**batch, output_attentions=True, output_hidden_states=True) - loss = torch.tensor(0.0, device=batch["input_ids"].device) + _, hidden_states, attentions = self.model.forward( + input_ids=input_ids, + segment_ids=segment_ids, + padding_mask=padding_mask, + output_attentions=True, + output_hidden_states=True + ) + loss = torch.tensor(0.0, device=input_ids.device) # calculating attention loss for student_attention, teacher_attention, dim_mapping in zip( diff --git a/test/modeling/test_modeling_dpr.py b/test/modeling/test_modeling_dpr.py index c6a30c0212..4e66955a71 100644 --- a/test/modeling/test_modeling_dpr.py +++ b/test/modeling/test_modeling_dpr.py @@ -1038,9 +1038,3 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ # ) # # trainer2.train() - - -if __name__ == "__main__": - # test_dpr_training() - test_dpr_context_only() - # test_dpr_modules() From 23d38ec89f960b7dd1b551480ef47e3f4339a89b Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 15 Jun 2022 13:15:43 +0200 Subject: [PATCH 03/89] Working on removing Tokenizer --- haystack/errors.py | 9 + haystack/modeling/data_handler/processor.py | 18 +- haystack/modeling/model/_mappings.py | 128 +++++++ haystack/modeling/model/language_model.py | 102 ++--- haystack/modeling/model/tokenization.py | 351 ++++-------------- haystack/nodes/retriever/dense.py | 12 +- test/modeling/test_modeling_dpr.py | 46 +-- test/modeling/test_modeling_processor.py | 6 +- .../test_modeling_processor_saving_loading.py | 4 +- test/modeling/test_tokenization.py | 40 +- 10 files changed, 306 insertions(+), 410 deletions(-) create mode 100644 haystack/modeling/model/_mappings.py diff --git a/haystack/errors.py b/haystack/errors.py index ca7680f8dd..d5b1da13a2 100644 --- a/haystack/errors.py +++ b/haystack/errors.py @@ -35,6 +35,15 @@ def __repr__(self): return str(self) +class ModelingError(HaystackError): + """Exception for issues raised by the modeling module""" + + def __init__( + self, message: Optional[str] = None, docs_link: Optional[str] = "https://haystack.deepset.ai/" + ): + super().__init__(message=message, docs_link=docs_link) + + class PipelineError(HaystackError): """Exception for issues raised within a pipeline""" diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index b44f2e432b..b149478b8e 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -176,11 +176,11 @@ def load_from_dir(cls, load_dir: str): "Loading tokenizer from deprecated config. " "If you used `custom_vocab` or `never_split_chars`, this won't work anymore." 
) - tokenizer = Tokenizer.load( + tokenizer = get_tokenizer load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"] ) else: - tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"]) + tokenizer = get_tokenizerload_dir, tokenizer_class=config["tokenizer"]) # we have to delete the tokenizer string from config, because we pass it as Object del config["tokenizer"] @@ -216,7 +216,7 @@ def convert_from_transformers( **kwargs, ): tokenizer_args = tokenizer_args or {} - tokenizer = Tokenizer.load( + tokenizer = get_tokenizer tokenizer_name_or_path, tokenizer_class=tokenizer_class, use_fast=use_fast, @@ -916,8 +916,8 @@ def load_from_dir(cls, load_dir: str): processor_config_file = Path(load_dir) / "processor_config.json" config = json.load(open(processor_config_file)) # init tokenizer - query_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query") - passage_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage") + query_tokenizer = get_tokenizerload_dir, tokenizer_class=config["query_tokenizer"], subfolder="query") + passage_tokenizer = get_tokenizerload_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage") # we have to delete the tokenizer string from config, because we pass it as Object del config["query_tokenizer"] @@ -1320,9 +1320,9 @@ def load_from_dir(cls, load_dir: str): processor_config_file = Path(load_dir) / "processor_config.json" config = json.load(open(processor_config_file)) # init tokenizer - query_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query") - passage_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage") - table_tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["table_tokenizer"], subfolder="table") + query_tokenizer = get_tokenizerload_dir, tokenizer_class=config["query_tokenizer"], subfolder="query") + passage_tokenizer = get_tokenizerload_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage") + table_tokenizer = get_tokenizerload_dir, tokenizer_class=config["table_tokenizer"], subfolder="table") # we have to delete the tokenizer string from config, because we pass it as Object del config["query_tokenizer"] @@ -1944,7 +1944,7 @@ def load_from_dir(cls, load_dir: str): processor_config_file = Path(load_dir) / "processor_config.json" config = json.load(open(processor_config_file)) # init tokenizer - tokenizer = Tokenizer.load(load_dir, tokenizer_class=config["tokenizer"]) + tokenizer = get_tokenizerload_dir, tokenizer_class=config["tokenizer"]) # we have to delete the tokenizer string from config, because we pass it as Object del config["tokenizer"] diff --git a/haystack/modeling/model/_mappings.py b/haystack/modeling/model/_mappings.py new file mode 100644 index 0000000000..f37822af38 --- /dev/null +++ b/haystack/modeling/model/_mappings.py @@ -0,0 +1,128 @@ +HF_PARAMETERS_BY_MODEL = { + "bert": {"prefix": "Bert"}, + "xlm.*roberta": {"prefix": "XLMRoberta"}, + "roberta.*xml": {"prefix": "XLMRoberta"}, + "bigbird": {"prefix": "BigBird"}, + "roberta": {"prefix": "Roberta"}, + "codebert.*mlm": {"prefix": "Roberta"}, + "mlm.*codebert": {"prefix": "Roberta"}, + "camembert": {"prefix": "Camembert"}, + "umberto": {"prefix": "Camembert"}, + "albert": {"prefix": "Albert"}, + "distilbert": { + "prefix": "DistilBert", + "sequence_summary_config": {"summary_last_dropout": 0, "summary_type": "first", 
"summary_activation": "tanh"}, + }, + "xlnet": {"prefix": "XLNet", "sequence_summary_config": {"summary_last_dropout": 0}}, + "electra": { + "prefix": "Electra", + "sequence_summary_config": { + "summary_last_dropout": 0, + "summary_type": "first", + "summary_activation": "gelu", + "summary_use_proj": False, + }, + }, + "word2vec": {"prefix": "WordEmbedding_LM"}, + "glove": {"prefix": "WordEmbedding_LM"}, + "minilm": {"prefix": "Bert"}, + "deberta-v2": { + "prefix": "DebertaV2", + "sequence_summary_config": { + "summary_last_dropout": 0, + "summary_type": "first", + "summary_activati": "tanh", + "summary_use_proj": False, + }, + }, + "data2vec-vision": { + "prefix": "Data2VecVision", + } +} + +HF_MODEL_TYPES = { + "bert": "Bert", + "albert": "Albert", + "roberta": "Roberta", + "xlm-roberta": "XLMRoberta", + "distilbert": "DistilBert", + "xlnet": "XLNet", + "electra": "Electra", + "camembert": "Camembert", + "big_bird": "BigBird", + "deberta-v2": "DebertaV2", + "data2vec-vision": "Data2VecVision", +} + +HF_MODEL_STRINGS_HINTS = { + "xlm.*roberta|roberta.*xlm": "XLMRoberta", + "bigbird": "BigBird", + "roberta": "Roberta", + "codebert": "Roberta", + "camembert": "Camembert", + "albert": "Albert", + "distilbert": "DistilBert", + "bert": "Bert", + "xlnet": "XLNet", + "electra": "Electra", + "word2vec": "WordEmbedding_LM", + "glove": "WordEmbedding_LM", + "minilm": "Bert", + "dpr-question_encoder": "DPRQuestionEncoder", + "dpr-ctx_encoder": "DPRContextEncoder", + "data2vec-vision": "Data2VecVision", +} + +KNOWN_LANGUAGES = ("german", "english", "chinese", "indian", "french", "polish", "spanish", "multilingual") +KNOWN_LANGUAGE_SPECIFIC_MODELS = (("camembert", "french"), ("umberto", "italian")) + + + + + +TOKENIZERS_PARAMS = { + "Albert": {"keep_accents": True}, + "XLMRoberta": {}, + "Roberta": {}, + "DistilBert": {}, + "Bert": {}, + "XLNet": {"keep_accents": True}, + "Electra": {}, + "Camembert": {}, + "DPRQuestionEncoder": {}, + "DPRContextEncoder": {}, + "BigBird": {}, + "DebertaV2": {}, +} + +TOKENIZERS_MAPPING = { + "albert": "Albert", + "xlm-roberta": "XLMRoberta", + "roberta": "Roberta", + "distilbert": "DistilBert", + "bert": "Bert", + "xlnet": "XLNet", + "electra": "Electra", + "camembert": "Camembert", + "big_bird": "BigBird", + "deberta-v2": "DebertaV2", +} + +TOKENIZERS_STRING_HINTS = { + "albert": "Albert", + "bigbird": "BigBird", + "xlm-roberta": "XLMRoberta", + "roberta": "Roberta", + "codebert": "Roberta", + "camembert": "Camembert", + "umberto": "Camembert", + "distilbert": "DistilBert", + "debertav2": "DebertaV2", + "debertav3": "DebertaV2", + "bert": "Bert", + "xlnet": "XLNet", + "electra": "Electra", + "minilm": "Bert", + "dpr-question_encoder": "DPRQuestionEncoder", + "dpr-ctx_encoder": "DPRContextEncoder", +} \ No newline at end of file diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 71d3f5538f..3c60552bfb 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -27,7 +27,6 @@ from abc import ABC, abstractmethod from pathlib import Path from functools import wraps -from black import out import numpy as np import torch from torch import nn @@ -36,82 +35,16 @@ from transformers import AutoModel, AutoConfig from transformers.modeling_utils import SequenceSummary - -logger = logging.getLogger(__name__) +from haystack.modeling.model._mappings import ( + HF_PARAMETERS_BY_MODEL, + HF_MODEL_TYPES, + HF_MODEL_STRINGS_HINTS, + KNOWN_LANGUAGE_SPECIFIC_MODELS, + KNOWN_LANGUAGES +) 
-HF_PARAMETERS_BY_MODEL = { - "bert": {"prefix": "Bert"}, - "xlm.*roberta": {"prefix": "XLMRoberta"}, - "roberta.*xml": {"prefix": "XLMRoberta"}, - "bigbird": {"prefix": "BigBird"}, - "roberta": {"prefix": "Roberta"}, - "codebert.*mlm": {"prefix": "Roberta"}, - "mlm.*codebert": {"prefix": "Roberta"}, - "camembert": {"prefix": "Camembert"}, - "umberto": {"prefix": "Camembert"}, - "albert": {"prefix": "Albert"}, - "distilbert": { - "prefix": "DistilBert", - "sequence_summary_config": {"summary_last_dropout": 0, "summary_type": "first", "summary_activation": "tanh"}, - }, - "xlnet": {"prefix": "XLNet", "sequence_summary_config": {"summary_last_dropout": 0}}, - "electra": { - "prefix": "Electra", - "sequence_summary_config": { - "summary_last_dropout": 0, - "summary_type": "first", - "summary_activation": "gelu", - "summary_use_proj": False, - }, - }, - "word2vec": {"prefix": "WordEmbedding_LM"}, - "glove": {"prefix": "WordEmbedding_LM"}, - "minilm": {"prefix": "Bert"}, - "deberta-v2": { - "prefix": "DebertaV2", - "sequence_summary_config": { - "summary_last_dropout": 0, - "summary_type": "first", - "summary_activati": "tanh", - "summary_use_proj": False, - }, - }, -} - -HF_MODEL_TYPES = { - "bert": "Bert", - "albert": "Albert", - "roberta": "Roberta", - "xlm-roberta": "XLMRoberta", - "distilbert": "DistilBert", - "xlnet": "XLNet", - "electra": "Electra", - "camembert": "Camembert", - "big_bird": "BigBird", - "deberta-v2": "DebertaV2", -} - -HF_MODEL_STRINGS_HINTS = { - "xlm.*roberta|roberta.*xlm": "XLMRoberta", - "bigbird": "BigBird", - "roberta": "Roberta", - "codebert": "Roberta", - "camembert": "Camembert", - "albert": "Albert", - "distilbert": "DistilBert", - "bert": "Bert", - "xlnet": "XLNet", - "electra": "Electra", - "word2vec": "WordEmbedding_LM", - "glove": "WordEmbedding_LM", - "minilm": "Bert", - "dpr-question_encoder": "DPRQuestionEncoder", - "dpr-ctx_encoder": "DPRContextEncoder", -} - -KNOWN_LANGUAGES = ("german", "english", "chinese", "indian", "french", "polish", "spanish", "multilingual") -KNOWN_LANGUAGE_SPECIFIC_MODELS = (("camembert", "french"), ("umberto", "italian")) +logger = logging.getLogger(__name__) def silence_transformers_logs(from_pretrained_func): @@ -142,6 +75,7 @@ def quiet_from_pretrained_func(cls, *args, **kwargs): # in the output vectors OUTPUT_DIM_NAMES = ["dim", "hidden_size", "d_model"] + # TODO analyse if LMs can be completely used through HF transformers class LanguageModel(nn.Module, ABC): """ @@ -765,6 +699,24 @@ def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: st **kwargs ) + +class Data2VecVision(HFLanguageModel): + """ + A Data2Vec (Vision) model that wraps Hugging Face's implementation + (https://github.com/huggingface/transformers) to fit the LanguageModel class. + Paper: https://arxiv.org/abs/1810.04805. + """ + + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + language=language, + n_added_tokens=n_added_tokens, + model_type="data2vec-vision", + **kwargs + ) + + class DPRQuestionEncoder(LanguageModel): """ A DPRQuestionEncoder model that wraps Hugging Face's implementation. 
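Note (illustrative only, not part of the patch): with the lookup tables now living in haystack/modeling/model/_mappings.py, get_language_model_class() resolves the wrapper class in two steps. It first looks up config.model_type in HF_MODEL_TYPES and, only when that lookup fails (and the model is not a DPR model), falls back to matching the model name against the regex hints in HF_MODEL_STRINGS_HINTS with re.match; because several hints can match the same name, the combined "xlm.*roberta|roberta.*xlm" pattern is listed before the plain "roberta" entry. A minimal sketch of that fallback, using a hypothetical helper name:

    import re
    from haystack.modeling.model._mappings import HF_MODEL_STRINGS_HINTS

    def infer_class_from_name(model_name_or_path: str):
        # The first regex hint that matches the start of the name wins,
        # e.g. "bert-base-uncased" -> "Bert"; returns None if nothing matches.
        for pattern, class_name in HF_MODEL_STRINGS_HINTS.items():
            if re.match(pattern, model_name_or_path):
                return class_name
        return None
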
diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index 0fdf406e8f..a3a08d10c5 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -21,36 +21,13 @@ import re import logging import numpy as np -from transformers import ( - AutoTokenizer, - AlbertTokenizer, - AlbertTokenizerFast, - BertTokenizer, - BertTokenizerFast, - DistilBertTokenizer, - DistilBertTokenizerFast, - ElectraTokenizer, - ElectraTokenizerFast, - RobertaTokenizer, - RobertaTokenizerFast, - XLMRobertaTokenizer, - XLMRobertaTokenizerFast, - XLNetTokenizer, - XLNetTokenizerFast, - CamembertTokenizer, - CamembertTokenizerFast, - DPRContextEncoderTokenizer, - DPRContextEncoderTokenizerFast, - DPRQuestionEncoderTokenizer, - DPRQuestionEncoderTokenizerFast, - BigBirdTokenizer, - BigBirdTokenizerFast, - DebertaV2Tokenizer, - DebertaV2TokenizerFast, -) -from transformers import AutoConfig +from transformers import AutoTokenizer +import transformers +from transformers import AutoConfig, PreTrainedTokenizer +from haystack.errors import ModelingError from haystack.modeling.data_handler.samples import SampleBasket +from haystack.modeling.model._mappings import TOKENIZERS_PARAMS, TOKENIZERS_MAPPING, TOKENIZERS_STRING_HINTS logger = logging.getLogger(__name__) @@ -59,265 +36,95 @@ # Special characters used by the different tokenizers to indicate start of word / whitespace SPECIAL_TOKENIZER_CHARS = r"^(##|Ġ|▁)" -# TODO analyse if tokenizers can be completely used through HF transformers -class Tokenizer: +def get_tokenizer( + pretrained_model_name_or_path: str, + revision: str = None, + tokenizer_classname: str = None, + use_fast: bool = True, + auth_token: Optional[str] = None, + **kwargs, +): """ - Simple Wrapper for Tokenizers from the transformers package. Enables loading of different Tokenizer classes with a uniform interface. + Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from + model config or define it manually via `tokenizer_classname`. + + :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`) + :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. + :param tokenizer_classname: Name of the tokenizer class to load (e.g. `BertTokenizer`) + :param use_fast: Indicate if Haystack should try to load the fast version of the tokenizer (True) or use the Python one (False). Defaults to True. + :param auth_token: The auth_token to use in `PretrainedTokenizer.from_pretrained()`, if required + :param kwargs: other kwargs to pass on to `PretrainedTokenizer.from_pretrained()` + :return: Tokenizer """ + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + + try: + if tokenizer_classname is None: + tokenizer_classname = _infer_tokenizer_classname(pretrained_model_name_or_path, auth_token=auth_token) + + logger.debug(f"Loading tokenizer of type '{tokenizer_classname}'") - @classmethod - def load( - cls, - pretrained_model_name_or_path, - revision=None, - tokenizer_class=None, - use_fast=True, - use_auth_token: Union[bool, str] = None, - **kwargs, - ): - """ - Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from - model config or define it manually via `tokenizer_class`. - - :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. 
`bert-base-uncased`) - :type pretrained_model_name_or_path: str - :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. - :type revision: str - :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`) - :type tokenizer_class: str - :param use_fast: (Optional, False by default) Indicate if Haystack should try to load the fast version of the tokenizer (True) or - use the Python one (False). - Only DistilBERT, BERT and Electra fast tokenizers are supported. - :type use_fast: bool - :param kwargs: - :return: Tokenizer - """ - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - kwargs["revision"] = revision - - if tokenizer_class is None: - tokenizer_class = cls._infer_tokenizer_class(pretrained_model_name_or_path, use_auth_token=use_auth_token) - - logger.debug(f"Loading tokenizer of type '{tokenizer_class}'") # return appropriate tokenizer object - ret = None - if "AutoTokenizer" in tokenizer_class: - ret = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, use_fast=use_fast, **kwargs) - elif "AlbertTokenizer" in tokenizer_class: - if use_fast: - ret = AlbertTokenizerFast.from_pretrained( - pretrained_model_name_or_path, keep_accents=True, use_auth_token=use_auth_token, **kwargs - ) - else: - ret = AlbertTokenizer.from_pretrained( - pretrained_model_name_or_path, keep_accents=True, use_auth_token=use_auth_token, **kwargs - ) - elif "XLMRobertaTokenizer" in tokenizer_class: - if use_fast: - ret = XLMRobertaTokenizerFast.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - else: - ret = XLMRobertaTokenizer.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - elif "RobertaTokenizer" in tokenizer_class: - if use_fast: - ret = RobertaTokenizerFast.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - else: - ret = RobertaTokenizer.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - elif "DistilBertTokenizer" in tokenizer_class: - if use_fast: - ret = DistilBertTokenizerFast.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - else: - ret = DistilBertTokenizer.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - elif "BertTokenizer" in tokenizer_class: - if use_fast: - ret = BertTokenizerFast.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - else: - ret = BertTokenizer.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - elif "XLNetTokenizer" in tokenizer_class: - if use_fast: - ret = XLNetTokenizerFast.from_pretrained( - pretrained_model_name_or_path, keep_accents=True, use_auth_token=use_auth_token, **kwargs - ) - else: - ret = XLNetTokenizer.from_pretrained( - pretrained_model_name_or_path, keep_accents=True, use_auth_token=use_auth_token, **kwargs - ) - elif "ElectraTokenizer" in tokenizer_class: - if use_fast: - ret = ElectraTokenizerFast.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - else: - ret = ElectraTokenizer.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - elif "CamembertTokenizer" in tokenizer_class: - if use_fast: - ret = CamembertTokenizerFast.from_pretrained( - pretrained_model_name_or_path, 
use_auth_token=use_auth_token, **kwargs - ) - else: - ret = CamembertTokenizer.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - elif "DPRQuestionEncoderTokenizer" in tokenizer_class: - if use_fast: - ret = DPRQuestionEncoderTokenizerFast.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - else: - ret = DPRQuestionEncoderTokenizer.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - elif "DPRContextEncoderTokenizer" in tokenizer_class: - if use_fast: - ret = DPRContextEncoderTokenizerFast.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - else: - ret = DPRContextEncoderTokenizer.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - elif "BigBirdTokenizer" in tokenizer_class: - if use_fast: - ret = BigBirdTokenizerFast.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - else: - ret = BigBirdTokenizer.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - elif "DebertaV2Tokenizer" in tokenizer_class: - if use_fast: - ret = DebertaV2TokenizerFast.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - else: - ret = DebertaV2Tokenizer.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token, **kwargs - ) - if ret is None: - raise Exception("Unable to load tokenizer") - return ret - @staticmethod - def _infer_tokenizer_class(pretrained_model_name_or_path, use_auth_token: Union[bool, str] = None): - # Infer Tokenizer from model type in config + suffix = "TokenizerFast" if use_fast else "Tokenizer" + params = TOKENIZERS_PARAMS.get(tokenizer_classname, {}) + tokenizer_class: PreTrainedTokenizer = getattr(transformers, tokenizer_classname + suffix, None) + + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, use_auth_token=auth_token or False, revision=revision, **params, **kwargs) + + except Exception as e: + raise ModelingError("Unable to load tokenizer.") from e + + +def _infer_tokenizer_classname(pretrained_model_name_or_path, auth_token: Union[bool, str] = None): + """ + Infer Tokenizer from model type in config + """ + try: + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, use_auth_token=auth_token or False) + except OSError: + # Haystack model (no 'config.json' file) try: - config = AutoConfig.from_pretrained(pretrained_model_name_or_path, use_auth_token=use_auth_token) - except OSError: - # Haystack model (no 'config.json' file) - try: - config = AutoConfig.from_pretrained( - pretrained_model_name_or_path + "/language_model_config.json", use_auth_token=use_auth_token - ) - except Exception as e: - logger.warning("No config file found. Trying to infer Tokenizer type from model name") - tokenizer_class = Tokenizer._infer_tokenizer_class_from_string(pretrained_model_name_or_path) - return tokenizer_class + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path + "/language_model_config.json", use_auth_token=auth_token or False + ) + model_type = config.model_type + tokenizer_classname = TOKENIZERS_MAPPING.get(config.model_type, None) - model_type = config.model_type + except Exception as e: + logger.warning("No config file found. 
Trying to infer Tokenizer type from model name") + tokenizer_classname = Tokenizer._infer_tokenizer_class_from_string(pretrained_model_name_or_path) + return tokenizer_classname - if model_type == "xlm-roberta": - tokenizer_class = "XLMRobertaTokenizer" - elif model_type == "roberta": - if "mlm" in pretrained_model_name_or_path.lower(): - raise NotImplementedError("MLM part of codebert is currently not supported in Haystack") - tokenizer_class = "RobertaTokenizer" - elif model_type == "camembert": - tokenizer_class = "CamembertTokenizer" - elif model_type == "albert": - tokenizer_class = "AlbertTokenizer" - elif model_type == "distilbert": - tokenizer_class = "DistilBertTokenizer" - elif model_type == "bert": - tokenizer_class = "BertTokenizer" - elif model_type == "xlnet": - tokenizer_class = "XLNetTokenizer" - elif model_type == "electra": - tokenizer_class = "ElectraTokenizer" - elif model_type == "dpr": + + + if not tokenizer_classname: + if model_type == "dpr": if config.architectures[0] == "DPRQuestionEncoder": - tokenizer_class = "DPRQuestionEncoderTokenizer" + tokenizer_classname = "DPRQuestionEncoderTokenizer" elif config.architectures[0] == "DPRContextEncoder": - tokenizer_class = "DPRContextEncoderTokenizer" + tokenizer_classname = "DPRContextEncoderTokenizer" elif config.architectures[0] == "DPRReader": raise NotImplementedError("DPRReader models are currently not supported.") - elif model_type == "big_bird": - tokenizer_class = "BigBirdTokenizer" - elif model_type == "deberta-v2": - tokenizer_class = "DebertaV2Tokenizer" + else: # Fall back to inferring type from model name logger.warning( "Could not infer Tokenizer type from config. Trying to infer Tokenizer type from model name." ) - tokenizer_class = Tokenizer._infer_tokenizer_class_from_string(pretrained_model_name_or_path) - - return tokenizer_class - - @staticmethod - def _infer_tokenizer_class_from_string(pretrained_model_name_or_path): - # If inferring tokenizer class from config doesn't succeed, - # fall back to inferring tokenizer class from model name. 
- if "albert" in pretrained_model_name_or_path.lower(): - tokenizer_class = "AlbertTokenizer" - elif "bigbird" in pretrained_model_name_or_path.lower(): - tokenizer_class = "BigBirdTokenizer" - elif "xlm-roberta" in pretrained_model_name_or_path.lower(): - tokenizer_class = "XLMRobertaTokenizer" - elif "roberta" in pretrained_model_name_or_path.lower(): - tokenizer_class = "RobertaTokenizer" - elif "codebert" in pretrained_model_name_or_path.lower(): - if "mlm" in pretrained_model_name_or_path.lower(): - raise NotImplementedError("MLM part of codebert is currently not supported in Haystack") - tokenizer_class = "RobertaTokenizer" - elif "camembert" in pretrained_model_name_or_path.lower() or "umberto" in pretrained_model_name_or_path.lower(): - tokenizer_class = "CamembertTokenizer" - elif "distilbert" in pretrained_model_name_or_path.lower(): - tokenizer_class = "DistilBertTokenizer" - elif ( - "debertav2" in pretrained_model_name_or_path.lower() or "debertav3" in pretrained_model_name_or_path.lower() - ): - tokenizer_class = "DebertaV2Tokenizer" - elif "bert" in pretrained_model_name_or_path.lower(): - tokenizer_class = "BertTokenizer" - elif "xlnet" in pretrained_model_name_or_path.lower(): - tokenizer_class = "XLNetTokenizer" - elif "electra" in pretrained_model_name_or_path.lower(): - tokenizer_class = "ElectraTokenizer" - elif "minilm" in pretrained_model_name_or_path.lower(): - tokenizer_class = "BertTokenizer" - elif "dpr-question_encoder" in pretrained_model_name_or_path.lower(): - tokenizer_class = "DPRQuestionEncoderTokenizer" - elif "dpr-ctx_encoder" in pretrained_model_name_or_path.lower(): - tokenizer_class = "DPRContextEncoderTokenizer" - else: - raise ValueError( - f"Could not infer tokenizer_class from model config or " - f"name '{pretrained_model_name_or_path}'. Set arg `tokenizer_class` " - f"in Tokenizer.load() to one of: AlbertTokenizer, XLMRobertaTokenizer, " - f"RobertaTokenizer, DistilBertTokenizer, BertTokenizer, XLNetTokenizer, " - f"CamembertTokenizer, ElectraTokenizer, DPRQuestionEncoderTokenizer," - f"DPRContextEncoderTokenizer." - ) + candidates = [value for key, value in TOKENIZERS_STRING_HINTS.items() if key in pretrained_model_name_or_path] + if not candidates: + raise ValueError( + f"Could not infer tokenizer_class from model config or " + f"name '{pretrained_model_name_or_path}'. Set arg `tokenizer_classname` " + f"in get_tokenizer) to one of: {'Tokenizer, '.join(TOKENIZERS_MAPPING.values())}." + ) + tokenizer_classname = candidates[0] - return tokenizer_class + if tokenizer_classname == "Roberta" and "mlm" in pretrained_model_name_or_path.lower(): + raise NotImplementedError("MLM part of codebert is currently not supported in Haystack") + + return tokenizer_classname def tokenize_batch_question_answering(pre_baskets, tokenizer, indices): @@ -413,7 +220,7 @@ def tokenize_with_metadata(text: str, tokenizer) -> Dict[str, Any]: type is lost which might be helpful for certain NLP tasks ( e.g tab for tables). :param text: Text to tokenize - :param tokenizer: Tokenizer (e.g. from Tokenizer.load()) + :param tokenizer: Tokenizer (e.g. from get_tokenizer)) :return: Dictionary with "tokens", "offsets" and "start_of_word" """ # normalize all other whitespace characters to " " @@ -485,7 +292,7 @@ def truncate_sequences( :param seq_a: First sequence of tokens/offsets/... :param seq_b: Optional second sequence of tokens/offsets/... - :param tokenizer: Tokenizer (e.g. from Tokenizer.load()) + :param tokenizer: Tokenizer (e.g. 
from get_tokenizer)) :param max_seq_len: :param truncation_strategy: how the sequence(s) should be truncated down. Default: "longest_first" (see above for other options). :param with_special_tokens: If true, it'll remove some additional tokens to have exactly enough space for later adding special tokens (CLS, SEP etc.) @@ -517,7 +324,7 @@ def _words_to_tokens(words, word_offsets, tokenizer): :type words: list :param word_offsets: Character indices where each word begins in the original text :type word_offsets: list - :param tokenizer: Tokenizer (e.g. from Tokenizer.load()) + :param tokenizer: Tokenizer (e.g. from get_tokenizer)) :return: tokens, offsets, start_of_word """ tokens = [] diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 834cd46011..0add22f735 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -20,7 +20,7 @@ from haystack.document_stores import BaseDocumentStore from haystack.nodes.retriever.base import BaseRetriever from haystack.nodes.retriever._embedding_encoder import _EMBEDDING_ENCODERS -from haystack.modeling.model.tokenization import Tokenizer +from haystack.modeling.model.tokenization import get_tokenizer from haystack.modeling.model.language_model import LanguageModel from haystack.modeling.model.biadaptive_model import BiAdaptiveModel from haystack.modeling.model.triadaptive_model import TriAdaptiveModel @@ -158,7 +158,7 @@ def __init__( tokenizers_default_classes["passage"] = None # type: ignore # Init & Load Encoders - self.query_tokenizer = Tokenizer.load( + self.query_tokenizer = get_tokenizer pretrained_model_name_or_path=query_embedding_model, revision=model_version, do_lower_case=True, @@ -172,7 +172,7 @@ def __init__( language_model_class="DPRQuestionEncoder", use_auth_token=use_auth_token, ) - self.passage_tokenizer = Tokenizer.load( + self.passage_tokenizer = get_tokenizer pretrained_model_name_or_path=passage_embedding_model, revision=model_version, do_lower_case=True, @@ -867,7 +867,7 @@ def __init__( tokenizers_default_classes["table"] = None # type: ignore # Init & Load Encoders - self.query_tokenizer = Tokenizer.load( + self.query_tokenizer = get_tokenizer pretrained_model_name_or_path=query_embedding_model, revision=model_version, do_lower_case=True, @@ -881,7 +881,7 @@ def __init__( language_model_class="DPRQuestionEncoder", use_auth_token=use_auth_token, ) - self.passage_tokenizer = Tokenizer.load( + self.passage_tokenizer = get_tokenizer pretrained_model_name_or_path=passage_embedding_model, revision=model_version, do_lower_case=True, @@ -895,7 +895,7 @@ def __init__( language_model_class="DPRContextEncoder", use_auth_token=use_auth_token, ) - self.table_tokenizer = Tokenizer.load( + self.table_tokenizer = get_tokenizer pretrained_model_name_or_path=table_embedding_model, revision=model_version, do_lower_case=True, diff --git a/test/modeling/test_modeling_dpr.py b/test/modeling/test_modeling_dpr.py index 4e66955a71..b3805126b4 100644 --- a/test/modeling/test_modeling_dpr.py +++ b/test/modeling/test_modeling_dpr.py @@ -12,7 +12,7 @@ from haystack.modeling.model.biadaptive_model import BiAdaptiveModel from haystack.modeling.model.language_model import LanguageModel, DPRContextEncoder, DPRQuestionEncoder from haystack.modeling.model.prediction_head import TextSimilarityHead -from haystack.modeling.model.tokenization import Tokenizer +from haystack.modeling.model.tokenization import get_tokenizer from haystack.modeling.utils import set_all_seeds, initialize_device_settings @@ 
-24,10 +24,10 @@ def test_dpr_modules(caplog=None): devices, n_gpu = initialize_device_settings(use_cuda=True) # 1.Create question and passage tokenizers - query_tokenizer = Tokenizer.load( + query_tokenizer = get_tokenizer pretrained_model_name_or_path="facebook/dpr-question_encoder-single-nq-base", do_lower_case=True, use_fast=True ) - passage_tokenizer = Tokenizer.load( + passage_tokenizer = get_tokenizer pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base", do_lower_case=True, use_fast=True ) @@ -343,9 +343,9 @@ def test_dpr_processor(embed_title, passage_ids, passage_attns, use_fast, num_ha ] query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = Tokenizer.load(query_tok, use_fast=use_fast) + query_tokenizer = get_tokenizerquery_tok, use_fast=use_fast) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = Tokenizer.load(passage_tok, use_fast=use_fast) + passage_tokenizer = get_tokenizerpassage_tok, use_fast=use_fast) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -400,9 +400,9 @@ def test_dpr_processor_empty_title(use_fast, embed_title): } query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = Tokenizer.load(query_tok, use_fast=use_fast) + query_tokenizer = get_tokenizerquery_tok, use_fast=use_fast) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = Tokenizer.load(passage_tok, use_fast=use_fast) + passage_tokenizer = get_tokenizerpassage_tok, use_fast=use_fast) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -485,9 +485,9 @@ def test_dpr_problematic(): ] query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = Tokenizer.load(query_tok, use_fast=True) + query_tokenizer = get_tokenizerquery_tok, use_fast=True) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = Tokenizer.load(passage_tok, use_fast=True) + passage_tokenizer = get_tokenizerpassage_tok, use_fast=True) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -516,9 +516,9 @@ def test_dpr_query_only(): ] query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = Tokenizer.load(query_tok, use_fast=True) + query_tokenizer = get_tokenizerquery_tok, use_fast=True) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = Tokenizer.load(passage_tok, use_fast=True) + passage_tokenizer = get_tokenizerpassage_tok, use_fast=True) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -578,9 +578,9 @@ def test_dpr_context_only(): ] query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = Tokenizer.load(query_tok, use_fast=True) + query_tokenizer = get_tokenizerquery_tok, use_fast=True) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = Tokenizer.load(passage_tok, use_fast=True) + passage_tokenizer = get_tokenizerpassage_tok, use_fast=True) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -629,9 +629,9 @@ def test_dpr_processor_save_load(tmp_path): } query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = Tokenizer.load(query_tok, use_fast=True) + query_tokenizer = get_tokenizerquery_tok, use_fast=True) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - 
passage_tokenizer = Tokenizer.load(passage_tok, use_fast=True) + passage_tokenizer = get_tokenizerpassage_tok, use_fast=True) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -689,13 +689,13 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ # load model from model hub query_embedding_model = query_and_passage_model["query"] passage_embedding_model = query_and_passage_model["passage"] - query_tokenizer = Tokenizer.load( + query_tokenizer = get_tokenizer pretrained_model_name_or_path=query_embedding_model ) # tokenizer class is inferred automatically query_encoder = LanguageModel.load( pretrained_model_name_or_path=query_embedding_model, language_model_class="DPRQuestionEncoder" ) - passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=passage_embedding_model) + passage_tokenizer = get_tokenizerpretrained_model_name_or_path=passage_embedding_model) passage_encoder = LanguageModel.load( pretrained_model_name_or_path=passage_embedding_model, language_model_class="DPRContextEncoder" ) @@ -737,13 +737,13 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ passage_tokenizer.save_pretrained(save_dir + f"/{passage_encoder_dir}") # load model from disk - loaded_query_tokenizer = Tokenizer.load( + loaded_query_tokenizer = get_tokenizer pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, use_fast=True ) # tokenizer class is inferred automatically loaded_query_encoder = LanguageModel.load( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, language_model_class="DPRQuestionEncoder" ) - loaded_passage_tokenizer = Tokenizer.load( + loaded_passage_tokenizer = get_tokenizer pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, use_fast=True ) loaded_passage_encoder = LanguageModel.load( @@ -849,13 +849,13 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ loaded_passage_tokenizer.save_pretrained(save_dir + f"/{passage_encoder_dir}") # load model from disk - query_tokenizer = Tokenizer.load( + query_tokenizer = get_tokenizer pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir ) # tokenizer class is inferred automatically query_encoder = LanguageModel.load( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, language_model_class="DPRQuestionEncoder" ) - passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir) + passage_tokenizer = get_tokenizerpretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir) passage_encoder = LanguageModel.load( pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, language_model_class="DPRContextEncoder" ) @@ -942,9 +942,9 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ # # device, n_gpu = initialize_device_settings(use_cuda=False) # -# query_tokenizer = Tokenizer.load(pretrained_model_name_or_path=question_lang_model, +# query_tokenizer = get_tokenizerpretrained_model_name_or_path=question_lang_model, # do_lower_case=do_lower_case, use_fast=use_fast) -# passage_tokenizer = Tokenizer.load(pretrained_model_name_or_path=passage_lang_model, +# passage_tokenizer = get_tokenizerpretrained_model_name_or_path=passage_lang_model, # do_lower_case=do_lower_case, use_fast=use_fast) # label_list = ["hard_negative", "positive"] # diff --git a/test/modeling/test_modeling_processor.py b/test/modeling/test_modeling_processor.py index 8744aeb6cb..49b2ba4bb4 
100644 --- a/test/modeling/test_modeling_processor.py +++ b/test/modeling/test_modeling_processor.py @@ -4,7 +4,7 @@ from transformers import AutoTokenizer from haystack.modeling.data_handler.processor import SquadProcessor -from haystack.modeling.model.tokenization import Tokenizer +from haystack.modeling.model.tokenization import get_tokenizer from ..conftest import SAMPLES_PATH @@ -24,7 +24,7 @@ def test_dataset_from_dicts_qa_inference(caplog=None): sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"] for model in models: - tokenizer = Tokenizer.load(pretrained_model_name_or_path=model, use_fast=True) + tokenizer = get_tokenizerpretrained_model_name_or_path=model, use_fast=True) processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None) for sample_type in sample_types: @@ -251,7 +251,7 @@ def test_dataset_from_dicts_qa_labelconversion(caplog=None): sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"] for model in models: - tokenizer = Tokenizer.load(pretrained_model_name_or_path=model, use_fast=True) + tokenizer = get_tokenizerpretrained_model_name_or_path=model, use_fast=True) processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None) for sample_type in sample_types: diff --git a/test/modeling/test_modeling_processor_saving_loading.py b/test/modeling/test_modeling_processor_saving_loading.py index 8972422364..c4318b0e1f 100644 --- a/test/modeling/test_modeling_processor_saving_loading.py +++ b/test/modeling/test_modeling_processor_saving_loading.py @@ -2,7 +2,7 @@ from pathlib import Path from haystack.modeling.data_handler.processor import SquadProcessor -from haystack.modeling.model.tokenization import Tokenizer +from haystack.modeling.model.tokenization import get_tokenizer from haystack.modeling.utils import set_all_seeds import torch @@ -16,7 +16,7 @@ def test_processor_saving_loading(tmp_path, caplog): set_all_seeds(seed=42) lang_model = "roberta-base" - tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False) + tokenizer = get_tokenizerpretrained_model_name_or_path=lang_model, do_lower_case=False) processor = SquadProcessor( tokenizer=tokenizer, diff --git a/test/modeling/test_tokenization.py b/test/modeling/test_tokenization.py index 486b338f77..36b9692886 100644 --- a/test/modeling/test_tokenization.py +++ b/test/modeling/test_tokenization.py @@ -13,7 +13,7 @@ from tokenizers.pre_tokenizers import WhitespaceSplit -from haystack.modeling.model.tokenization import Tokenizer +from haystack.modeling.model.tokenization import get_tokenizer import numpy as np @@ -40,27 +40,27 @@ def test_basic_loading(caplog): caplog.set_level(logging.CRITICAL) # slow tokenizers - tokenizer = Tokenizer.load(pretrained_model_name_or_path="bert-base-cased", do_lower_case=True, use_fast=False) + tokenizer = get_tokenizerpretrained_model_name_or_path="bert-base-cased", do_lower_case=True, use_fast=False) assert type(tokenizer) == BertTokenizer assert tokenizer.basic_tokenizer.do_lower_case == True - tokenizer = Tokenizer.load(pretrained_model_name_or_path="xlnet-base-cased", do_lower_case=True, use_fast=False) + tokenizer = get_tokenizerpretrained_model_name_or_path="xlnet-base-cased", do_lower_case=True, use_fast=False) assert type(tokenizer) == XLNetTokenizer assert tokenizer.do_lower_case == True - tokenizer = Tokenizer.load(pretrained_model_name_or_path="roberta-base", use_fast=False) + tokenizer = get_tokenizerpretrained_model_name_or_path="roberta-base", use_fast=False) assert type(tokenizer) 
== RobertaTokenizer # fast tokenizers - tokenizer = Tokenizer.load(pretrained_model_name_or_path="bert-base-cased", do_lower_case=True) + tokenizer = get_tokenizerpretrained_model_name_or_path="bert-base-cased", do_lower_case=True) assert type(tokenizer) == BertTokenizerFast assert tokenizer.do_lower_case == True - tokenizer = Tokenizer.load(pretrained_model_name_or_path="xlnet-base-cased", do_lower_case=True) + tokenizer = get_tokenizerpretrained_model_name_or_path="xlnet-base-cased", do_lower_case=True) assert type(tokenizer) == XLNetTokenizerFast assert tokenizer.do_lower_case == True - tokenizer = Tokenizer.load(pretrained_model_name_or_path="roberta-base") + tokenizer = get_tokenizerpretrained_model_name_or_path="roberta-base") assert type(tokenizer) == RobertaTokenizerFast @@ -69,7 +69,7 @@ def test_bert_tokenizer_all_meta(caplog): lang_model = "bert-base-cased" - tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False) + tokenizer = get_tokenizerpretrained_model_name_or_path=lang_model, do_lower_case=False) basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars" @@ -199,9 +199,9 @@ def test_save_load(tmp_path, caplog): tokenizers = [] for lang_name in lang_names: if "xlnet" in lang_name.lower(): - t = Tokenizer.load(lang_name, lower_case=False, use_fast=True, from_slow=True) + t = get_tokenizerlang_name, lower_case=False, use_fast=True, from_slow=True) else: - t = Tokenizer.load(lang_name, lower_case=False) + t = get_tokenizerlang_name, lower_case=False) t.add_tokens(new_tokens=["neverseentokens"]) tokenizers.append(t) @@ -211,7 +211,7 @@ def test_save_load(tmp_path, caplog): tokenizer_type = tokenizer.__class__.__name__ save_dir = f"{tmp_path}/testsave/{tokenizer_type}" tokenizer.save_pretrained(save_dir) - tokenizer_loaded = Tokenizer.load(save_dir, tokenizer_class=tokenizer_type) + tokenizer_loaded = get_tokenizersave_dir, tokenizer_class=tokenizer_type) encoded_before = tokenizer.encode_plus(basic_text).encodings[0] encoded_after = tokenizer_loaded.encode_plus(basic_text).encodings[0] data_before = { @@ -225,8 +225,8 @@ def test_save_load(tmp_path, caplog): @pytest.mark.parametrize("model_name", ["bert-base-german-cased", "google/electra-small-discriminator"]) def test_fast_tokenizer_with_examples(caplog, model_name): - fast_tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=True) - tokenizer = Tokenizer.load(model_name, lower_case=False, use_fast=False) + fast_tokenizer = get_tokenizermodel_name, lower_case=False, use_fast=True) + tokenizer = get_tokenizermodel_name, lower_case=False, use_fast=False) for text in TEXTS: # plain tokenize function @@ -247,7 +247,7 @@ def test_all_tokenizer_on_special_cases(caplog): add_prefix_space = True else: add_prefix_space = False - t = Tokenizer.load(lang_name, lower_case=False, add_prefix_space=add_prefix_space) + t = get_tokenizerlang_name, lower_case=False, add_prefix_space=add_prefix_space) tokenizers.append(t) texts = [ @@ -322,7 +322,7 @@ def test_bert_custom_vocab(caplog): lang_model = "bert-base-cased" - tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False) + tokenizer = get_tokenizerpretrained_model_name_or_path=lang_model, do_lower_case=False) # deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt") tokenizer.add_tokens(new_tokens=["neverseentokens"]) @@ -389,7 +389,7 @@ def test_fast_bert_custom_vocab(caplog): lang_model = "bert-base-cased" - tokenizer = 
Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=False, use_fast=True) + tokenizer = get_tokenizerpretrained_model_name_or_path=lang_model, do_lower_case=False, use_fast=True) # deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt") tokenizer.add_tokens(new_tokens=["neverseentokens"]) @@ -458,7 +458,7 @@ def test_fast_bert_custom_vocab(caplog): def test_fast_tokenizer_type(caplog, model_name, tokenizer_type): caplog.set_level(logging.CRITICAL) - tokenizer = Tokenizer.load(model_name, use_fast=True) + tokenizer = get_tokenizermodel_name, use_fast=True) assert type(tokenizer) is tokenizer_type @@ -466,7 +466,7 @@ def test_fast_tokenizer_type(caplog, model_name, tokenizer_type): # def test_fast_bert_tokenizer_strip_accents(caplog): # caplog.set_level(logging.CRITICAL) # -# tokenizer = Tokenizer.load("dbmdz/bert-base-german-uncased", +# tokenizer = get_tokenizer"dbmdz/bert-base-german-uncased", # use_fast=True, # strip_accents=False) # assert type(tokenizer) is BertTokenizerFast @@ -477,13 +477,13 @@ def test_fast_tokenizer_type(caplog, model_name, tokenizer_type): def test_fast_electra_tokenizer(caplog): caplog.set_level(logging.CRITICAL) - tokenizer = Tokenizer.load("dbmdz/electra-base-german-europeana-cased-discriminator", use_fast=True) + tokenizer = get_tokenizer"dbmdz/electra-base-german-europeana-cased-discriminator", use_fast=True) assert type(tokenizer) is ElectraTokenizerFast @pytest.mark.parametrize("model_name", ["bert-base-cased", "distilbert-base-uncased", "deepset/electra-base-squad2"]) def test_detokenization_in_fast_tokenizers(model_name): - tokenizer = Tokenizer.load(pretrained_model_name_or_path=model_name, use_fast=True) + tokenizer = get_tokenizerpretrained_model_name_or_path=model_name, use_fast=True) for text in TEXTS: encoded = tokenizer.encode_plus(text, add_special_tokens=False).encodings[0] From c61ed7963668b59fd14cda6ac6a60863ba7cef80 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 15 Jun 2022 16:56:45 +0200 Subject: [PATCH 04/89] Removing Tokenizer --- haystack/modeling/data_handler/processor.py | 20 +- haystack/modeling/model/language_model.py | 50 ++--- haystack/modeling/model/tokenization.py | 188 ++++++------------ haystack/nodes/retriever/dense.py | 10 +- test/modeling/test_modeling_dpr.py | 44 ++-- test/modeling/test_modeling_processor.py | 4 +- .../test_modeling_processor_saving_loading.py | 2 +- test/modeling/test_tokenization.py | 38 ++-- 8 files changed, 132 insertions(+), 224 deletions(-) diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index b149478b8e..bb02ca1874 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -18,7 +18,7 @@ from torch.utils.data import TensorDataset from haystack.modeling.model.tokenization import ( - Tokenizer, + get_tokenizer, tokenize_batch_question_answering, tokenize_with_metadata, truncate_sequences, @@ -176,11 +176,11 @@ def load_from_dir(cls, load_dir: str): "Loading tokenizer from deprecated config. " "If you used `custom_vocab` or `never_split_chars`, this won't work anymore." 
) - tokenizer = get_tokenizer + tokenizer = get_tokenizer( load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"] ) else: - tokenizer = get_tokenizerload_dir, tokenizer_class=config["tokenizer"]) + tokenizer = get_tokenizer(load_dir, tokenizer_class=config["tokenizer"]) # we have to delete the tokenizer string from config, because we pass it as Object del config["tokenizer"] @@ -216,7 +216,7 @@ def convert_from_transformers( **kwargs, ): tokenizer_args = tokenizer_args or {} - tokenizer = get_tokenizer + tokenizer = get_tokenizer( tokenizer_name_or_path, tokenizer_class=tokenizer_class, use_fast=use_fast, @@ -916,8 +916,8 @@ def load_from_dir(cls, load_dir: str): processor_config_file = Path(load_dir) / "processor_config.json" config = json.load(open(processor_config_file)) # init tokenizer - query_tokenizer = get_tokenizerload_dir, tokenizer_class=config["query_tokenizer"], subfolder="query") - passage_tokenizer = get_tokenizerload_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage") + query_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query") + passage_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage") # we have to delete the tokenizer string from config, because we pass it as Object del config["query_tokenizer"] @@ -1320,9 +1320,9 @@ def load_from_dir(cls, load_dir: str): processor_config_file = Path(load_dir) / "processor_config.json" config = json.load(open(processor_config_file)) # init tokenizer - query_tokenizer = get_tokenizerload_dir, tokenizer_class=config["query_tokenizer"], subfolder="query") - passage_tokenizer = get_tokenizerload_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage") - table_tokenizer = get_tokenizerload_dir, tokenizer_class=config["table_tokenizer"], subfolder="table") + query_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query") + passage_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage") + table_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["table_tokenizer"], subfolder="table") # we have to delete the tokenizer string from config, because we pass it as Object del config["query_tokenizer"] @@ -1944,7 +1944,7 @@ def load_from_dir(cls, load_dir: str): processor_config_file = Path(load_dir) / "processor_config.json" config = json.load(open(processor_config_file)) # init tokenizer - tokenizer = get_tokenizerload_dir, tokenizer_class=config["tokenizer"]) + tokenizer = get_tokenizer(load_dir, tokenizer_class=config["tokenizer"]) # we have to delete the tokenizer string from config, because we pass it as Object del config["tokenizer"] diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 3c60552bfb..32f8b5593d 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -17,7 +17,6 @@ Acknowledgements: Many of the modeling parts here come from the great transformers repository: https://github.com/huggingface/transformers. Thanks for the great work! 
""" -from __future__ import absolute_import, division, print_function, unicode_literals from typing import Optional, Dict, Any, Union import re @@ -47,6 +46,11 @@ logger = logging.getLogger(__name__) +#: Names of the attributes in various model configs which refer to the number of dimensions in the output vectors +OUTPUT_DIM_NAMES = ["dim", "hidden_size", "d_model"] + + + def silence_transformers_logs(from_pretrained_func): """ A wrapper that raises the log level of Transformers to @@ -71,17 +75,12 @@ def quiet_from_pretrained_func(cls, *args, **kwargs): return quiet_from_pretrained_func -# These are the names of the attributes in various model configs which refer to the number of dimensions -# in the output vectors -OUTPUT_DIM_NAMES = ["dim", "hidden_size", "d_model"] - # TODO analyse if LMs can be completely used through HF transformers class LanguageModel(nn.Module, ABC): """ - The parent class for any kind of model that can embed language into a semantic vector space. Practically - speaking, these models read in tokenized sentences and return vectors that capture the meaning of sentences - or of tokens. + The parent class for any kind of model that can embed language into a semantic vector space. + These models read in tokenized sentences and return vectors that capture the meaning of sentences or of tokens. """ subclasses: dict = {} @@ -98,7 +97,14 @@ def __init__(self): super().__init__() @abstractmethod - def forward(self, input_ids: torch.Tensor, segment_ids: torch.Tensor, padding_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None): + def forward( + self, + input_ids: torch.Tensor, + segment_ids: torch.Tensor, + padding_mask: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None + ): raise NotImplementedError @staticmethod @@ -117,32 +123,6 @@ def load( 1. Specifying its name and downloading the model. 2. Pointing to the directory the model is saved in. - Available remote models: - - * bert-base-uncased - * bert-large-uncased - * bert-base-cased - * bert-large-cased - * bert-base-multilingual-uncased - * bert-base-multilingual-cased - * bert-base-chinese - * bert-base-german-cased - * roberta-base - * roberta-large - * xlnet-base-cased - * xlnet-large-cased - * xlm-roberta-base - * xlm-roberta-large - * albert-base-v2 - * albert-large-v2 - * distilbert-base-german-cased - * distilbert-base-multilingual-cased - * google/electra-small-discriminator - * google/electra-base-discriminator - * google/electra-large-discriminator - * facebook/dpr-question_encoder-single-nq-base - * facebook/dpr-ctx_encoder-single-nq-base - See all supported model variations at: https://huggingface.co/models. The appropriate language model class is inferred automatically from model configuration diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index a3a08d10c5..63138f47f1 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -12,34 +12,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -Tokenization classes. 
-""" -from __future__ import absolute_import, division, print_function, unicode_literals -from typing import Dict, Any, Tuple, Optional, List, Union + +from typing import Dict, Any, Tuple, Optional, List import re import logging import numpy as np -from transformers import AutoTokenizer -import transformers -from transformers import AutoConfig, PreTrainedTokenizer +from transformers import AutoTokenizer, PreTrainedTokenizer, RobertaTokenizer from haystack.errors import ModelingError from haystack.modeling.data_handler.samples import SampleBasket -from haystack.modeling.model._mappings import TOKENIZERS_PARAMS, TOKENIZERS_MAPPING, TOKENIZERS_STRING_HINTS logger = logging.getLogger(__name__) -# Special characters used by the different tokenizers to indicate start of word / whitespace +#: Special characters used by the different tokenizers to indicate start of word / whitespace SPECIAL_TOKENIZER_CHARS = r"^(##|Ġ|▁)" + def get_tokenizer( pretrained_model_name_or_path: str, revision: str = None, - tokenizer_classname: str = None, use_fast: bool = True, auth_token: Optional[str] = None, **kwargs, @@ -50,84 +44,30 @@ def get_tokenizer( :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`) :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. - :param tokenizer_classname: Name of the tokenizer class to load (e.g. `BertTokenizer`) :param use_fast: Indicate if Haystack should try to load the fast version of the tokenizer (True) or use the Python one (False). Defaults to True. :param auth_token: The auth_token to use in `PretrainedTokenizer.from_pretrained()`, if required :param kwargs: other kwargs to pass on to `PretrainedTokenizer.from_pretrained()` :return: Tokenizer """ - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - - try: - if tokenizer_classname is None: - tokenizer_classname = _infer_tokenizer_classname(pretrained_model_name_or_path, auth_token=auth_token) - - logger.debug(f"Loading tokenizer of type '{tokenizer_classname}'") - - # return appropriate tokenizer object - - suffix = "TokenizerFast" if use_fast else "Tokenizer" - params = TOKENIZERS_PARAMS.get(tokenizer_classname, {}) - tokenizer_class: PreTrainedTokenizer = getattr(transformers, tokenizer_classname + suffix, None) - - return tokenizer_class.from_pretrained(pretrained_model_name_or_path, use_auth_token=auth_token or False, revision=revision, **params, **kwargs) - - except Exception as e: - raise ModelingError("Unable to load tokenizer.") from e - - -def _infer_tokenizer_classname(pretrained_model_name_or_path, auth_token: Union[bool, str] = None): - """ - Infer Tokenizer from model type in config - """ - try: - config = AutoConfig.from_pretrained(pretrained_model_name_or_path, use_auth_token=auth_token or False) - except OSError: - # Haystack model (no 'config.json' file) - try: - config = AutoConfig.from_pretrained( - pretrained_model_name_or_path + "/language_model_config.json", use_auth_token=auth_token or False - ) - model_type = config.model_type - tokenizer_classname = TOKENIZERS_MAPPING.get(config.model_type, None) + model_name_or_path = str(pretrained_model_name_or_path) + params = {} - except Exception as e: - logger.warning("No config file found. 
Trying to infer Tokenizer type from model name")
-        tokenizer_classname = Tokenizer._infer_tokenizer_class_from_string(pretrained_model_name_or_path)
-        return tokenizer_classname
+    if "mlm" in model_name_or_path.lower():
+        raise NotImplementedError("MLM part of codebert is currently not supported in Haystack")
+    if any(name in model_name_or_path for name in ("albert", "xlnet")):
+        params["keep_accents"] = True
+    return AutoTokenizer.from_pretrained(
+        model_name_or_path,
+        use_auth_token=auth_token or False,
+        revision=revision,
+        use_fast=use_fast,
+        **params,
+        **kwargs
+    )
-
-    if not tokenizer_classname:
-        if model_type == "dpr":
-            if config.architectures[0] == "DPRQuestionEncoder":
-                tokenizer_classname = "DPRQuestionEncoderTokenizer"
-            elif config.architectures[0] == "DPRContextEncoder":
-                tokenizer_classname = "DPRContextEncoderTokenizer"
-            elif config.architectures[0] == "DPRReader":
-                raise NotImplementedError("DPRReader models are currently not supported.")
-
-        else:
-            # Fall back to inferring type from model name
-            logger.warning(
-                "Could not infer Tokenizer type from config. Trying to infer Tokenizer type from model name."
-            )
-            candidates = [value for key, value in TOKENIZERS_STRING_HINTS.items() if key in pretrained_model_name_or_path]
-            if not candidates:
-                raise ValueError(
-                    f"Could not infer tokenizer_class from model config or "
-                    f"name '{pretrained_model_name_or_path}'. Set arg `tokenizer_classname` "
-                    f"in get_tokenizer) to one of: {'Tokenizer, '.join(TOKENIZERS_MAPPING.values())}."
-                )
-            tokenizer_classname = candidates[0]
-
-    if tokenizer_classname == "Roberta" and "mlm" in pretrained_model_name_or_path.lower():
-        raise NotImplementedError("MLM part of codebert is currently not supported in Haystack")
-
-    return tokenizer_classname
-
-
-def tokenize_batch_question_answering(pre_baskets, tokenizer, indices):
+def tokenize_batch_question_answering(pre_baskets: Dict[Any, Any], tokenizer: PreTrainedTokenizer, indices: List[Any]) -> List[SampleBasket]:
     """
     Tokenizes text data for question answering tasks.
     Tokenization means splitting words into subwords, depending on the tokenizer's vocabulary.
@@ -136,16 +76,20 @@ def tokenize_batch_question_answering(pre_baskets, tokenizer, indices):
     - Then we tokenize each question individually
     - We construct dicts with question and corresponding document text + tokens + offsets + ids

-    :param pre_baskets: input dicts with QA info #todo change to input objects
+    :param pre_baskets: input dicts with QA info #TODO change to input objects
     :param tokenizer: tokenizer to be used
-    :param indices: list, indices used during multiprocessing so that IDs assigned to our baskets are unique
+    :param indices: indices used during multiprocessing so that IDs assigned to our baskets are unique
     :return: baskets, list containing question and corresponding document information
     """
-    assert len(indices) == len(pre_baskets)
-    assert tokenizer.is_fast, (
-        "Processing QA data is only supported with fast tokenizers for now.\n"
-        "Please load Tokenizers with 'use_fast=True' option."
-    )
+    if not len(indices) == len(pre_baskets):
+        raise ValueError("indices and pre_baskets must have the same length")
+
+    if not tokenizer.is_fast:
+        raise ModelingError(
+            "Processing QA data is only supported with fast tokenizers for now."
+            "Please load Tokenizers with 'use_fast=True' option."
+ ) + baskets = [] # # Tokenize texts in batch mode texts = [d["context"] for d in pre_baskets] @@ -199,9 +143,7 @@ def tokenize_batch_question_answering(pre_baskets, tokenizer, indices): def _get_start_of_word_QA(word_ids): - words = np.array(word_ids) - start_of_word_single = [1] + list(np.ediff1d(words)) - return start_of_word_single + return [1] + list(np.ediff1d(np.array(word_ids))) def tokenize_with_metadata(text: str, tokenizer) -> Dict[str, Any]: @@ -230,36 +172,25 @@ def tokenize_with_metadata(text: str, tokenizer) -> Dict[str, Any]: # Fast Tokenizers return offsets, so we don't need to calculate them ourselves if tokenizer.is_fast: # tokenized = tokenizer(text, return_offsets_mapping=True, return_special_tokens_mask=True) - tokenized2 = tokenizer.encode_plus(text, return_offsets_mapping=True, return_special_tokens_mask=True) + tokenized = tokenizer.encode_plus(text, return_offsets_mapping=True, return_special_tokens_mask=True) - tokens2 = tokenized2["input_ids"] - offsets2 = np.array([x[0] for x in tokenized2["offset_mapping"]]) + tokens = tokenized["input_ids"] + offsets = np.array([x[0] for x in tokenized["offset_mapping"]]) # offsets2 = [x[0] for x in tokenized2["offset_mapping"]] - words = np.array(tokenized2.encodings[0].words) + words = np.array(tokenized.encodings[0].words) # TODO check for validity for all tokenizer and special token types words[0] = -1 words[-1] = words[-2] words += 1 - start_of_word2 = [0] + list(np.ediff1d(words)) - ####### - - # start_of_word3 = [] - # last_word = -1 - # for word_id in tokenized2.encodings[0].words: - # if word_id is None or word_id == last_word: - # start_of_word3.append(0) - # else: - # start_of_word3.append(1) - # last_word = word_id - - tokenized_dict = {"tokens": tokens2, "offsets": offsets2, "start_of_word": start_of_word2} + start_of_word = [0] + list(np.ediff1d(words)) + tokenized_dict = {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} else: # split text into "words" (here: simple whitespace tokenizer). words = text.split(" ") word_offsets = [] cumulated = 0 - for idx, word in enumerate(words): + for word in words: word_offsets.append(cumulated) cumulated += len(word) + 1 # 1 because we so far have whitespace tokenizer @@ -317,59 +248,56 @@ def truncate_sequences( return (seq_a, seq_b, overflowing_tokens) -def _words_to_tokens(words, word_offsets, tokenizer): +def _words_to_tokens(words: List[str], word_offsets: List[int], tokenizer: PreTrainedTokenizer) -> Tuple[str, List[str], List[int]]: """ Tokenize "words" into subword tokens while keeping track of offsets and if a token is the start of a word. :param words: list of words. - :type words: list :param word_offsets: Character indices where each word begins in the original text - :type word_offsets: list :param tokenizer: Tokenizer (e.g. from get_tokenizer)) - :return: tokens, offsets, start_of_word + :return: Tuple of (tokens, offsets, start_of_word) """ tokens = [] token_offsets = [] start_of_word = [] - idx = 0 - for w, w_off in zip(words, word_offsets): - idx += 1 - if idx % 500000 == 0: - logger.info(idx) + index = 0 + for index, word, word_offset in enumerate(zip(words, word_offsets)): + if index % 500000 == 0: + logger.info(index) # Get (subword) tokens of single word. # empty / pure whitespace - if len(w) == 0: + if len(word) == 0: continue # For the first word of a text: we just call the regular tokenize function. 
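# (The first word of a text has no preceding whitespace to encode. As an illustrative
# example with a RoBERTa-style slow tokenizer: tokenizer.tokenize("world") gives ['world'],
# while tokenizer.tokenize("world", add_prefix_space=True) gives ['Ġworld'], i.e. the
# prefix-space marker a word carries when it follows whitespace inside running text.)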
# For later words: we need to call it with add_prefix_space=True to get the same results with roberta / gpt2 tokenizer # see discussion here. https://github.com/huggingface/transformers/issues/1196 if len(tokens) == 0: - tokens_word = tokenizer.tokenize(w) + tokens_word = tokenizer.tokenize(word) else: if type(tokenizer) == RobertaTokenizer: - tokens_word = tokenizer.tokenize(w, add_prefix_space=True) + tokens_word = tokenizer.tokenize(word, add_prefix_space=True) else: - tokens_word = tokenizer.tokenize(w) + tokens_word = tokenizer.tokenize(word) # Sometimes the tokenizer returns no tokens if len(tokens_word) == 0: continue tokens += tokens_word # get global offset for each token in word + save marker for first tokens of a word - first_tok = True - for tok in tokens_word: - token_offsets.append(w_off) + first_token = True + for token in tokens_word: + token_offsets.append(word_offset) # Depending on the tokenizer type special chars are added to distinguish tokens with preceeding # whitespace (=> "start of a word"). We need to get rid of these to calculate the original length of the token - orig_tok = re.sub(SPECIAL_TOKENIZER_CHARS, "", tok) + original_token = re.sub(SPECIAL_TOKENIZER_CHARS, "", token) # Don't use length of unk token for offset calculation - if orig_tok == tokenizer.special_tokens_map["unk_token"]: - w_off += 1 + if original_token == tokenizer.special_tokens_map["unk_token"]: + word_offset += 1 else: - w_off += len(orig_tok) - if first_tok: + word_offset += len(original_token) + if first_token: start_of_word.append(True) - first_tok = False + first_token = False else: start_of_word.append(False) diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 0add22f735..3d13d81e99 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -158,7 +158,7 @@ def __init__( tokenizers_default_classes["passage"] = None # type: ignore # Init & Load Encoders - self.query_tokenizer = get_tokenizer + self.query_tokenizer = get_tokenizer( pretrained_model_name_or_path=query_embedding_model, revision=model_version, do_lower_case=True, @@ -172,7 +172,7 @@ def __init__( language_model_class="DPRQuestionEncoder", use_auth_token=use_auth_token, ) - self.passage_tokenizer = get_tokenizer + self.passage_tokenizer = get_tokenizer( pretrained_model_name_or_path=passage_embedding_model, revision=model_version, do_lower_case=True, @@ -867,7 +867,7 @@ def __init__( tokenizers_default_classes["table"] = None # type: ignore # Init & Load Encoders - self.query_tokenizer = get_tokenizer + self.query_tokenizer = get_tokenizer( pretrained_model_name_or_path=query_embedding_model, revision=model_version, do_lower_case=True, @@ -881,7 +881,7 @@ def __init__( language_model_class="DPRQuestionEncoder", use_auth_token=use_auth_token, ) - self.passage_tokenizer = get_tokenizer + self.passage_tokenizer = get_tokenizer( pretrained_model_name_or_path=passage_embedding_model, revision=model_version, do_lower_case=True, @@ -895,7 +895,7 @@ def __init__( language_model_class="DPRContextEncoder", use_auth_token=use_auth_token, ) - self.table_tokenizer = get_tokenizer + self.table_tokenizer = get_tokenizer( pretrained_model_name_or_path=table_embedding_model, revision=model_version, do_lower_case=True, diff --git a/test/modeling/test_modeling_dpr.py b/test/modeling/test_modeling_dpr.py index b3805126b4..978e118260 100644 --- a/test/modeling/test_modeling_dpr.py +++ b/test/modeling/test_modeling_dpr.py @@ -24,10 +24,10 @@ def test_dpr_modules(caplog=None): 
devices, n_gpu = initialize_device_settings(use_cuda=True) # 1.Create question and passage tokenizers - query_tokenizer = get_tokenizer + query_tokenizer = get_tokenizer( pretrained_model_name_or_path="facebook/dpr-question_encoder-single-nq-base", do_lower_case=True, use_fast=True ) - passage_tokenizer = get_tokenizer + passage_tokenizer = get_tokenizer( pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base", do_lower_case=True, use_fast=True ) @@ -343,9 +343,9 @@ def test_dpr_processor(embed_title, passage_ids, passage_attns, use_fast, num_ha ] query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = get_tokenizerquery_tok, use_fast=use_fast) + query_tokenizer = get_tokenizer(query_tok, use_fast=use_fast) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = get_tokenizerpassage_tok, use_fast=use_fast) + passage_tokenizer = get_tokenizer(passage_tok, use_fast=use_fast) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -400,9 +400,9 @@ def test_dpr_processor_empty_title(use_fast, embed_title): } query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = get_tokenizerquery_tok, use_fast=use_fast) + query_tokenizer = get_tokenizer(query_tok, use_fast=use_fast) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = get_tokenizerpassage_tok, use_fast=use_fast) + passage_tokenizer = get_tokenizer(passage_tok, use_fast=use_fast) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -485,9 +485,9 @@ def test_dpr_problematic(): ] query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = get_tokenizerquery_tok, use_fast=True) + query_tokenizer = get_tokenizer(query_tok, use_fast=True) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = get_tokenizerpassage_tok, use_fast=True) + passage_tokenizer = get_tokenizer(passage_tok, use_fast=True) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -516,9 +516,9 @@ def test_dpr_query_only(): ] query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = get_tokenizerquery_tok, use_fast=True) + query_tokenizer = get_tokenizer(query_tok, use_fast=True) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = get_tokenizerpassage_tok, use_fast=True) + passage_tokenizer = get_tokenizer(passage_tok, use_fast=True) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -578,9 +578,9 @@ def test_dpr_context_only(): ] query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = get_tokenizerquery_tok, use_fast=True) + query_tokenizer = get_tokenizer(query_tok, use_fast=True) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = get_tokenizerpassage_tok, use_fast=True) + passage_tokenizer = get_tokenizer(passage_tok, use_fast=True) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -629,9 +629,9 @@ def test_dpr_processor_save_load(tmp_path): } query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = get_tokenizerquery_tok, use_fast=True) + query_tokenizer = get_tokenizer(query_tok, use_fast=True) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = get_tokenizerpassage_tok, use_fast=True) + 
passage_tokenizer = get_tokenizer(passage_tok, use_fast=True) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -689,13 +689,13 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ # load model from model hub query_embedding_model = query_and_passage_model["query"] passage_embedding_model = query_and_passage_model["passage"] - query_tokenizer = get_tokenizer + query_tokenizer = get_tokenizer( pretrained_model_name_or_path=query_embedding_model ) # tokenizer class is inferred automatically query_encoder = LanguageModel.load( pretrained_model_name_or_path=query_embedding_model, language_model_class="DPRQuestionEncoder" ) - passage_tokenizer = get_tokenizerpretrained_model_name_or_path=passage_embedding_model) + passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_embedding_model) passage_encoder = LanguageModel.load( pretrained_model_name_or_path=passage_embedding_model, language_model_class="DPRContextEncoder" ) @@ -737,13 +737,13 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ passage_tokenizer.save_pretrained(save_dir + f"/{passage_encoder_dir}") # load model from disk - loaded_query_tokenizer = get_tokenizer + loaded_query_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, use_fast=True ) # tokenizer class is inferred automatically loaded_query_encoder = LanguageModel.load( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, language_model_class="DPRQuestionEncoder" ) - loaded_passage_tokenizer = get_tokenizer + loaded_passage_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, use_fast=True ) loaded_passage_encoder = LanguageModel.load( @@ -849,13 +849,13 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ loaded_passage_tokenizer.save_pretrained(save_dir + f"/{passage_encoder_dir}") # load model from disk - query_tokenizer = get_tokenizer + query_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir ) # tokenizer class is inferred automatically query_encoder = LanguageModel.load( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, language_model_class="DPRQuestionEncoder" ) - passage_tokenizer = get_tokenizerpretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir) + passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir) passage_encoder = LanguageModel.load( pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, language_model_class="DPRContextEncoder" ) @@ -942,9 +942,9 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ # # device, n_gpu = initialize_device_settings(use_cuda=False) # -# query_tokenizer = get_tokenizerpretrained_model_name_or_path=question_lang_model, +# query_tokenizer = get_tokenizer(pretrained_model_name_or_path=question_lang_model, # do_lower_case=do_lower_case, use_fast=use_fast) -# passage_tokenizer = get_tokenizerpretrained_model_name_or_path=passage_lang_model, +# passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_lang_model, # do_lower_case=do_lower_case, use_fast=use_fast) # label_list = ["hard_negative", "positive"] # diff --git a/test/modeling/test_modeling_processor.py b/test/modeling/test_modeling_processor.py index 49b2ba4bb4..8e3e6f9328 100644 --- a/test/modeling/test_modeling_processor.py +++ 
b/test/modeling/test_modeling_processor.py @@ -24,7 +24,7 @@ def test_dataset_from_dicts_qa_inference(caplog=None): sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"] for model in models: - tokenizer = get_tokenizerpretrained_model_name_or_path=model, use_fast=True) + tokenizer = get_tokenizer(pretrained_model_name_or_path=model, use_fast=True) processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None) for sample_type in sample_types: @@ -251,7 +251,7 @@ def test_dataset_from_dicts_qa_labelconversion(caplog=None): sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"] for model in models: - tokenizer = get_tokenizerpretrained_model_name_or_path=model, use_fast=True) + tokenizer = get_tokenizer(pretrained_model_name_or_path=model, use_fast=True) processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None) for sample_type in sample_types: diff --git a/test/modeling/test_modeling_processor_saving_loading.py b/test/modeling/test_modeling_processor_saving_loading.py index c4318b0e1f..154b303f70 100644 --- a/test/modeling/test_modeling_processor_saving_loading.py +++ b/test/modeling/test_modeling_processor_saving_loading.py @@ -16,7 +16,7 @@ def test_processor_saving_loading(tmp_path, caplog): set_all_seeds(seed=42) lang_model = "roberta-base" - tokenizer = get_tokenizerpretrained_model_name_or_path=lang_model, do_lower_case=False) + tokenizer = get_tokenizer(pretrained_model_name_or_path=lang_model, do_lower_case=False) processor = SquadProcessor( tokenizer=tokenizer, diff --git a/test/modeling/test_tokenization.py b/test/modeling/test_tokenization.py index 36b9692886..1a371972aa 100644 --- a/test/modeling/test_tokenization.py +++ b/test/modeling/test_tokenization.py @@ -40,27 +40,27 @@ def test_basic_loading(caplog): caplog.set_level(logging.CRITICAL) # slow tokenizers - tokenizer = get_tokenizerpretrained_model_name_or_path="bert-base-cased", do_lower_case=True, use_fast=False) + tokenizer = get_tokenizer(pretrained_model_name_or_path="bert-base-cased", do_lower_case=True, use_fast=False) assert type(tokenizer) == BertTokenizer assert tokenizer.basic_tokenizer.do_lower_case == True - tokenizer = get_tokenizerpretrained_model_name_or_path="xlnet-base-cased", do_lower_case=True, use_fast=False) + tokenizer = get_tokenizer(pretrained_model_name_or_path="xlnet-base-cased", do_lower_case=True, use_fast=False) assert type(tokenizer) == XLNetTokenizer assert tokenizer.do_lower_case == True - tokenizer = get_tokenizerpretrained_model_name_or_path="roberta-base", use_fast=False) + tokenizer = get_tokenizer(pretrained_model_name_or_path="roberta-base", use_fast=False) assert type(tokenizer) == RobertaTokenizer # fast tokenizers - tokenizer = get_tokenizerpretrained_model_name_or_path="bert-base-cased", do_lower_case=True) + tokenizer = get_tokenizer(pretrained_model_name_or_path="bert-base-cased", do_lower_case=True) assert type(tokenizer) == BertTokenizerFast assert tokenizer.do_lower_case == True - tokenizer = get_tokenizerpretrained_model_name_or_path="xlnet-base-cased", do_lower_case=True) + tokenizer = get_tokenizer(pretrained_model_name_or_path="xlnet-base-cased", do_lower_case=True) assert type(tokenizer) == XLNetTokenizerFast assert tokenizer.do_lower_case == True - tokenizer = get_tokenizerpretrained_model_name_or_path="roberta-base") + tokenizer = get_tokenizer(pretrained_model_name_or_path="roberta-base") assert type(tokenizer) == RobertaTokenizerFast @@ -69,7 +69,7 @@ def test_bert_tokenizer_all_meta(caplog): lang_model = 
"bert-base-cased" - tokenizer = get_tokenizerpretrained_model_name_or_path=lang_model, do_lower_case=False) + tokenizer = get_tokenizer(pretrained_model_name_or_path=lang_model, do_lower_case=False) basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars" @@ -199,9 +199,9 @@ def test_save_load(tmp_path, caplog): tokenizers = [] for lang_name in lang_names: if "xlnet" in lang_name.lower(): - t = get_tokenizerlang_name, lower_case=False, use_fast=True, from_slow=True) + t = get_tokenizer(lang_name, lower_case=False, use_fast=True, from_slow=True) else: - t = get_tokenizerlang_name, lower_case=False) + t = get_tokenizer(lang_name, lower_case=False) t.add_tokens(new_tokens=["neverseentokens"]) tokenizers.append(t) @@ -211,7 +211,7 @@ def test_save_load(tmp_path, caplog): tokenizer_type = tokenizer.__class__.__name__ save_dir = f"{tmp_path}/testsave/{tokenizer_type}" tokenizer.save_pretrained(save_dir) - tokenizer_loaded = get_tokenizersave_dir, tokenizer_class=tokenizer_type) + tokenizer_loaded = get_tokenizer(save_dir, tokenizer_class=tokenizer_type) encoded_before = tokenizer.encode_plus(basic_text).encodings[0] encoded_after = tokenizer_loaded.encode_plus(basic_text).encodings[0] data_before = { @@ -225,8 +225,8 @@ def test_save_load(tmp_path, caplog): @pytest.mark.parametrize("model_name", ["bert-base-german-cased", "google/electra-small-discriminator"]) def test_fast_tokenizer_with_examples(caplog, model_name): - fast_tokenizer = get_tokenizermodel_name, lower_case=False, use_fast=True) - tokenizer = get_tokenizermodel_name, lower_case=False, use_fast=False) + fast_tokenizer = get_tokenizer(model_name, lower_case=False, use_fast=True) + tokenizer = get_tokenizer(model_name, lower_case=False, use_fast=False) for text in TEXTS: # plain tokenize function @@ -247,7 +247,7 @@ def test_all_tokenizer_on_special_cases(caplog): add_prefix_space = True else: add_prefix_space = False - t = get_tokenizerlang_name, lower_case=False, add_prefix_space=add_prefix_space) + t = get_tokenizer(lang_name, lower_case=False, add_prefix_space=add_prefix_space) tokenizers.append(t) texts = [ @@ -322,7 +322,7 @@ def test_bert_custom_vocab(caplog): lang_model = "bert-base-cased" - tokenizer = get_tokenizerpretrained_model_name_or_path=lang_model, do_lower_case=False) + tokenizer = get_tokenizer(pretrained_model_name_or_path=lang_model, do_lower_case=False) # deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt") tokenizer.add_tokens(new_tokens=["neverseentokens"]) @@ -389,7 +389,7 @@ def test_fast_bert_custom_vocab(caplog): lang_model = "bert-base-cased" - tokenizer = get_tokenizerpretrained_model_name_or_path=lang_model, do_lower_case=False, use_fast=True) + tokenizer = get_tokenizer(pretrained_model_name_or_path=lang_model, do_lower_case=False, use_fast=True) # deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt") tokenizer.add_tokens(new_tokens=["neverseentokens"]) @@ -458,7 +458,7 @@ def test_fast_bert_custom_vocab(caplog): def test_fast_tokenizer_type(caplog, model_name, tokenizer_type): caplog.set_level(logging.CRITICAL) - tokenizer = get_tokenizermodel_name, use_fast=True) + tokenizer = get_tokenizer(model_name, use_fast=True) assert type(tokenizer) is tokenizer_type @@ -466,7 +466,7 @@ def test_fast_tokenizer_type(caplog, model_name, tokenizer_type): # def test_fast_bert_tokenizer_strip_accents(caplog): # caplog.set_level(logging.CRITICAL) # -# tokenizer = get_tokenizer"dbmdz/bert-base-german-uncased", +# tokenizer = 
get_tokenizer("dbmdz/bert-base-german-uncased", # use_fast=True, # strip_accents=False) # assert type(tokenizer) is BertTokenizerFast @@ -477,13 +477,13 @@ def test_fast_tokenizer_type(caplog, model_name, tokenizer_type): def test_fast_electra_tokenizer(caplog): caplog.set_level(logging.CRITICAL) - tokenizer = get_tokenizer"dbmdz/electra-base-german-europeana-cased-discriminator", use_fast=True) + tokenizer = get_tokenizer("dbmdz/electra-base-german-europeana-cased-discriminator", use_fast=True) assert type(tokenizer) is ElectraTokenizerFast @pytest.mark.parametrize("model_name", ["bert-base-cased", "distilbert-base-uncased", "deepset/electra-base-squad2"]) def test_detokenization_in_fast_tokenizers(model_name): - tokenizer = get_tokenizerpretrained_model_name_or_path=model_name, use_fast=True) + tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, use_fast=True) for text in TEXTS: encoded = tokenizer.encode_plus(text, add_special_tokens=False).encodings[0] From a7c9bc0d23cd29314eabdb8800c5954e64091475 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 17 Jun 2022 09:45:43 +0200 Subject: [PATCH 05/89] working on normalizing DPR implementation too --- haystack/modeling/data_handler/processor.py | 12 +- haystack/modeling/model/_mappings.py | 128 ---- haystack/modeling/model/adaptive_model.py | 16 +- haystack/modeling/model/biadaptive_model.py | 30 +- haystack/modeling/model/language_model.py | 703 +++++++----------- haystack/modeling/model/tokenization.py | 14 +- haystack/modeling/model/triadaptive_model.py | 26 +- haystack/modeling/training/base.py | 5 +- haystack/nodes/retriever/dense.py | 12 +- test/modeling/test_modeling_dpr.py | 57 +- .../modeling/test_modeling_prediction_head.py | 4 +- test/modeling/test_modeling_processor.py | 4 +- test/modeling/test_tokenization.py | 10 +- 13 files changed, 382 insertions(+), 639 deletions(-) delete mode 100644 haystack/modeling/model/_mappings.py diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index bb02ca1874..3164411e12 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -1,4 +1,4 @@ -from typing import Optional, Dict, List, Union, Any, Iterable +from typing import Optional, Dict, List, Union, Any, Iterable, Type import os import json @@ -16,6 +16,8 @@ import requests from tqdm import tqdm from torch.utils.data import TensorDataset +import transformers +from transformers import PreTrainedTokenizer from haystack.modeling.model.tokenization import ( get_tokenizer, @@ -915,9 +917,11 @@ def load_from_dir(cls, load_dir: str): # read config processor_config_file = Path(load_dir) / "processor_config.json" config = json.load(open(processor_config_file)) - # init tokenizer - query_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["query_tokenizer"], subfolder="query") - passage_tokenizer = get_tokenizer(load_dir, tokenizer_class=config["passage_tokenizer"], subfolder="passage") + # init tokenizers + query_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["query_tokenizer"]) + query_tokenizer = query_tokenizer_class.from_pretrained(pretrained_model_name_or_path=load_dir, subfolder="query") + passage_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["passage_tokenizer"]) + passage_tokenizer = passage_tokenizer_class.from_pretrained(pretrained_model_name_or_path=load_dir, subfolder="passage") # we have to delete the tokenizer string from config, because we pass it as Object del 
config["query_tokenizer"] diff --git a/haystack/modeling/model/_mappings.py b/haystack/modeling/model/_mappings.py deleted file mode 100644 index f37822af38..0000000000 --- a/haystack/modeling/model/_mappings.py +++ /dev/null @@ -1,128 +0,0 @@ -HF_PARAMETERS_BY_MODEL = { - "bert": {"prefix": "Bert"}, - "xlm.*roberta": {"prefix": "XLMRoberta"}, - "roberta.*xml": {"prefix": "XLMRoberta"}, - "bigbird": {"prefix": "BigBird"}, - "roberta": {"prefix": "Roberta"}, - "codebert.*mlm": {"prefix": "Roberta"}, - "mlm.*codebert": {"prefix": "Roberta"}, - "camembert": {"prefix": "Camembert"}, - "umberto": {"prefix": "Camembert"}, - "albert": {"prefix": "Albert"}, - "distilbert": { - "prefix": "DistilBert", - "sequence_summary_config": {"summary_last_dropout": 0, "summary_type": "first", "summary_activation": "tanh"}, - }, - "xlnet": {"prefix": "XLNet", "sequence_summary_config": {"summary_last_dropout": 0}}, - "electra": { - "prefix": "Electra", - "sequence_summary_config": { - "summary_last_dropout": 0, - "summary_type": "first", - "summary_activation": "gelu", - "summary_use_proj": False, - }, - }, - "word2vec": {"prefix": "WordEmbedding_LM"}, - "glove": {"prefix": "WordEmbedding_LM"}, - "minilm": {"prefix": "Bert"}, - "deberta-v2": { - "prefix": "DebertaV2", - "sequence_summary_config": { - "summary_last_dropout": 0, - "summary_type": "first", - "summary_activati": "tanh", - "summary_use_proj": False, - }, - }, - "data2vec-vision": { - "prefix": "Data2VecVision", - } -} - -HF_MODEL_TYPES = { - "bert": "Bert", - "albert": "Albert", - "roberta": "Roberta", - "xlm-roberta": "XLMRoberta", - "distilbert": "DistilBert", - "xlnet": "XLNet", - "electra": "Electra", - "camembert": "Camembert", - "big_bird": "BigBird", - "deberta-v2": "DebertaV2", - "data2vec-vision": "Data2VecVision", -} - -HF_MODEL_STRINGS_HINTS = { - "xlm.*roberta|roberta.*xlm": "XLMRoberta", - "bigbird": "BigBird", - "roberta": "Roberta", - "codebert": "Roberta", - "camembert": "Camembert", - "albert": "Albert", - "distilbert": "DistilBert", - "bert": "Bert", - "xlnet": "XLNet", - "electra": "Electra", - "word2vec": "WordEmbedding_LM", - "glove": "WordEmbedding_LM", - "minilm": "Bert", - "dpr-question_encoder": "DPRQuestionEncoder", - "dpr-ctx_encoder": "DPRContextEncoder", - "data2vec-vision": "Data2VecVision", -} - -KNOWN_LANGUAGES = ("german", "english", "chinese", "indian", "french", "polish", "spanish", "multilingual") -KNOWN_LANGUAGE_SPECIFIC_MODELS = (("camembert", "french"), ("umberto", "italian")) - - - - - -TOKENIZERS_PARAMS = { - "Albert": {"keep_accents": True}, - "XLMRoberta": {}, - "Roberta": {}, - "DistilBert": {}, - "Bert": {}, - "XLNet": {"keep_accents": True}, - "Electra": {}, - "Camembert": {}, - "DPRQuestionEncoder": {}, - "DPRContextEncoder": {}, - "BigBird": {}, - "DebertaV2": {}, -} - -TOKENIZERS_MAPPING = { - "albert": "Albert", - "xlm-roberta": "XLMRoberta", - "roberta": "Roberta", - "distilbert": "DistilBert", - "bert": "Bert", - "xlnet": "XLNet", - "electra": "Electra", - "camembert": "Camembert", - "big_bird": "BigBird", - "deberta-v2": "DebertaV2", -} - -TOKENIZERS_STRING_HINTS = { - "albert": "Albert", - "bigbird": "BigBird", - "xlm-roberta": "XLMRoberta", - "roberta": "Roberta", - "codebert": "Roberta", - "camembert": "Camembert", - "umberto": "Camembert", - "distilbert": "DistilBert", - "debertav2": "DebertaV2", - "debertav3": "DebertaV2", - "bert": "Bert", - "xlnet": "XLNet", - "electra": "Electra", - "minilm": "Bert", - "dpr-question_encoder": "DPRQuestionEncoder", - "dpr-ctx_encoder": "DPRContextEncoder", 
-} \ No newline at end of file diff --git a/haystack/modeling/model/adaptive_model.py b/haystack/modeling/model/adaptive_model.py index b4bcf22854..c80ac19083 100644 --- a/haystack/modeling/model/adaptive_model.py +++ b/haystack/modeling/model/adaptive_model.py @@ -13,7 +13,7 @@ from transformers.convert_graph_to_onnx import convert, quantize as quantize_model from haystack.modeling.data_handler.processor import Processor -from haystack.modeling.model.language_model import LanguageModel +from haystack.modeling.model.language_model import get_language_model, LanguageModel from haystack.modeling.model.prediction_head import PredictionHead, QuestionAnsweringHead from haystack.utils.experiment_tracking import Tracker as tracker @@ -196,7 +196,7 @@ def __init__( super(AdaptiveModel, self).__init__() # type: ignore self.device = device self.language_model = language_model.to(device) - self.lm_output_dims = language_model.get_output_dims() + self.lm_output_dims = language_model.output_dims self.prediction_heads = nn.ModuleList([ph.to(device) for ph in prediction_heads]) self.fit_heads_to_lm() self.dropout = nn.Dropout(embeds_dropout_prob) @@ -262,7 +262,6 @@ def load( # type: ignore load_dir: Union[str, Path], device: Union[str, torch.device], strict: bool = True, - lm_name: Optional[str] = None, processor: Optional[Processor] = None, ): """ @@ -277,17 +276,12 @@ def load( # type: ignore :param load_dir: Location where the AdaptiveModel is stored. :param device: To which device we want to sent the model, either torch.device("cpu") or torch.device("cuda"). - :param lm_name: The name to assign to the loaded language model. :param strict: Whether to strictly enforce that the keys loaded from saved model match the ones in the PredictionHead (see torch.nn.module.load_state_dict()). :param processor: Processor to populate prediction head with information coming from tasks. """ device = torch.device(device) - # Language Model - if lm_name: - language_model = LanguageModel.load(load_dir, haystack_lm_name=lm_name) - else: - language_model = LanguageModel.load(load_dir) + language_model = get_language_model(load_dir) # Prediction heads _, ph_config_files = cls._get_prediction_head_files(load_dir) @@ -334,7 +328,7 @@ def convert_from_transformers( :return: AdaptiveModel """ - lm = LanguageModel.load(model_name_or_path, revision=revision, auth_token=use_auth_token, **kwargs) + lm = get_language_model(model_name_or_path, revision=revision, auth_token=use_auth_token, **kwargs) if task_type is None: # Infer task type from config architecture = lm.model.config.architectures[0] @@ -584,7 +578,7 @@ def verify_vocab_size(self, vocab_size: int): msg = ( f"Vocab size of tokenizer {vocab_size} doesn't match with model {model_vocab_len}. 
" "If you added a custom vocabulary to the tokenizer, " - "make sure to supply 'n_added_tokens' to LanguageModel.load() and BertStyleLM.load()" + "make sure to supply 'n_added_tokens' to get_language_model() and BertStyleLM.load()" ) assert vocab_size == model_vocab_len, msg diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index e960fb01dd..3c7625e3ff 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -8,7 +8,7 @@ from torch import nn from haystack.modeling.data_handler.processor import Processor -from haystack.modeling.model.language_model import LanguageModel +from haystack.modeling.model.language_model import get_language_model, LanguageModel from haystack.modeling.model.prediction_head import PredictionHead, TextSimilarityHead from haystack.utils.experiment_tracking import Tracker as tracker @@ -74,9 +74,9 @@ def __init__( self.device = device self.language_model1 = language_model1.to(device) - self.lm1_output_dims = language_model1.get_output_dims() + self.lm1_output_dims = language_model1.output_dims self.language_model2 = language_model2.to(device) - self.lm2_output_dims = language_model2.get_output_dims() + self.lm2_output_dims = language_model2.output_dims self.dropout1 = nn.Dropout(embeds_dropout_prob) self.dropout2 = nn.Dropout(embeds_dropout_prob) self.prediction_heads = nn.ModuleList([ph.to(device) for ph in prediction_heads]) @@ -140,13 +140,13 @@ def load( """ # Language Model if lm1_name: - language_model1 = LanguageModel.load(os.path.join(load_dir, lm1_name)) + language_model1 = get_language_model(os.path.join(load_dir, lm1_name)) else: - language_model1 = LanguageModel.load(load_dir) + language_model1 = get_language_model(load_dir) if lm2_name: - language_model2 = LanguageModel.load(os.path.join(load_dir, lm2_name)) + language_model2 = get_language_model(os.path.join(load_dir, lm2_name)) else: - language_model2 = LanguageModel.load(load_dir) + language_model2 = get_language_model(load_dir) # Prediction heads ph_config_files = cls._get_prediction_head_files(load_dir) @@ -312,11 +312,15 @@ def forward_lm(self, **kwargs): :return: 2 tensors of pooled_output from the 2 language models. """ pooled_output = [None, None] + if "query_input_ids" in kwargs.keys(): - pooled_output1, hidden_states1 = self.language_model1(**kwargs) + query_params = {key.replace("query_", ""): value for key, value in kwargs.items() if key.startswith("query_")} + pooled_output1, _ = self.language_model1(**query_params) pooled_output[0] = pooled_output1 + if "passage_input_ids" in kwargs.keys(): - pooled_output2, hidden_states2 = self.language_model2(**kwargs) + passage_params = {key.replace("passage_", ""): value for key, value in kwargs.items() if key.startswith("passage_")} + pooled_output2, _ = self.language_model2(**passage_params) pooled_output[1] = pooled_output2 return tuple(pooled_output) @@ -350,7 +354,7 @@ def verify_vocab_size(self, vocab_size1: int, vocab_size2: int): msg = ( f"Vocab size of tokenizer {vocab_size1} doesn't match with model {model1_vocab_len}. 
" "If you added a custom vocabulary to the tokenizer, " - "make sure to supply 'n_added_tokens' to LanguageModel.load() and BertStyleLM.load()" + "make sure to supply 'n_added_tokens' to get_language_model() and BertStyleLM.load()" ) assert vocab_size1 == model1_vocab_len, msg @@ -359,7 +363,7 @@ def verify_vocab_size(self, vocab_size1: int, vocab_size2: int): msg = ( f"Vocab size of tokenizer {vocab_size1} doesn't match with model {model2_vocab_len}. " "If you added a custom vocabulary to the tokenizer, " - "make sure to supply 'n_added_tokens' to LanguageModel.load() and BertStyleLM.load()" + "make sure to supply 'n_added_tokens' to get_language_model() and BertStyleLM.load()" ) assert vocab_size2 == model2_vocab_len, msg @@ -458,10 +462,10 @@ def convert_from_transformers( :type processor: Processor :return: AdaptiveModel """ - lm1 = LanguageModel.load( + lm1 = get_language_model( pretrained_model_name_or_path=model_name_or_path1, language_model_class="DPRQuestionEncoder" ) - lm2 = LanguageModel.load( + lm2 = get_language_model( pretrained_model_name_or_path=model_name_or_path2, language_model_class="DPRContextEncoder" ) prediction_head = TextSimilarityHead(similarity_function=similarity_function) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 32f8b5593d..5bbe664b48 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -17,7 +17,7 @@ Acknowledgements: Many of the modeling parts here come from the great transformers repository: https://github.com/huggingface/transformers. Thanks for the great work! """ -from typing import Optional, Dict, Any, Union +from typing import Type, Optional, Dict, Any, Union, List import re import json @@ -34,12 +34,19 @@ from transformers import AutoModel, AutoConfig from transformers.modeling_utils import SequenceSummary -from haystack.modeling.model._mappings import ( - HF_PARAMETERS_BY_MODEL, - HF_MODEL_TYPES, - HF_MODEL_STRINGS_HINTS, - KNOWN_LANGUAGE_SPECIFIC_MODELS, - KNOWN_LANGUAGES +from haystack.errors import ModelingError + +LANGUAGE_HINTS = ( + ("german", "german"), + ("english", "english"), + ("chinese", "chinese"), + ("indian", "indian"), + ("french", "french"), + ("camembert", "french"), + ("polish", "polish"), + ("spanish", "spanish"), + ("umberto", "italian"), + ("multilingual", "multilingual"), ) @@ -75,7 +82,6 @@ def quiet_from_pretrained_func(cls, *args, **kwargs): return quiet_from_pretrained_func - # TODO analyse if LMs can be completely used through HF transformers class LanguageModel(nn.Module, ABC): """ @@ -83,18 +89,10 @@ class LanguageModel(nn.Module, ABC): These models read in tokenized sentences and return vectors that capture the meaning of sentences or of tokens. """ - subclasses: dict = {} - - def __init_subclass__(cls, **kwargs): - """ - This automatically keeps track of all available subclasses. - Enables generic load() or all specific LanguageModel implementation. 
- """ - super().__init_subclass__(**kwargs) - cls.subclasses[cls.__name__] = cls - - def __init__(self): + def __init__(self, name: str): super().__init__() + self._output_dims = None + self.name = name @abstractmethod def forward( @@ -107,126 +105,52 @@ def forward( ): raise NotImplementedError - @staticmethod - def load( - pretrained_model_name_or_path: Union[Path, str], - language: str = None, - n_added_tokens: int = 0, - language_model_class: Optional[str] = None, - auth_token: Optional[str] = None, - revision: Optional[str] = None, - **kwargs - ): + def enable_hidden_states_output(self): """ - Load a pretrained language model by doing one of the following: - - 1. Specifying its name and downloading the model. - 2. Pointing to the directory the model is saved in. - - See all supported model variations at: https://huggingface.co/models. - - The appropriate language model class is inferred automatically from model configuration - or can be manually supplied using `language_model_class`. - - :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. - :param revision: The version of the model to use from the Hugging Face model hub. This can be a tag name, a branch name, or a commit hash. - :param language_model_class: (Optional) Name of the language model class to load (for example `Bert`). Unused if the model is local. + Sets the model to output the hidden states """ - logger.info("LOADING MODEL") - logger.info("=============") - - config_file = Path(pretrained_model_name_or_path) / "language_model_config.json" - - if os.path.exists(config_file): - # it's a local directory in Haystack format - logger.info(f"Model found locally at {pretrained_model_name_or_path}") - config = json.load(open(config_file)) - language_model_class = config["name"] - else: - # It's from the model hub - logger.info(f"Could not find {pretrained_model_name_or_path} locally.") - logger.info(f"Looking on Transformers Model Hub (in local cache and online)...") - if language_model_class is None: - language_model_class = LanguageModel.get_language_model_class( - pretrained_model_name_or_path, auth_token=auth_token, **kwargs - ) - if not language_model_class: - raise Exception( - f"Model not found for {pretrained_model_name_or_path}. Either supply the local path for a saved " - f"model or one of bert/roberta/xlnet/albert/distilbert models that can be downloaded from remote. " - f"Ensure that the model class name can be inferred from the directory name when loading a " - f"Transformers' model." - ) - language_model = LanguageModel.subclasses[language_model_class]( - pretrained_model_name_or_path, - auth_token=auth_token, - n_added_tokens=n_added_tokens, - language=language, - revision=revision, - **kwargs - ) - logger.info(f"Loaded {pretrained_model_name_or_path}") - return language_model - - @staticmethod - def get_language_model_class(model_name_or_path, auth_token: Optional[str] = None, revision: Optional[str] = None, **kwargs): - """ - Given a model name, try to use AutoConfig to understand which model type it is. - In case it's not successful, tries to infer the type from the name of the model. 
- """ - # it's transformers format (either from model hub or local) - model_name_or_path = str(model_name_or_path) - config = AutoConfig.from_pretrained( - pretrained_model_name_or_path=model_name_or_path, - use_auth_token=auth_token or False, - revision=revision, - **kwargs - ) - language_model_class = HF_MODEL_TYPES.get(config.model_type, None) + self.model.encoder.config.output_hidden_states = True - # Handle special cases - if not language_model_class: + def disable_hidden_states_output(self): + """ + Sets the model to not output the hidden states + """ + self.model.encoder.config.output_hidden_states = False - # DPR - if config.model_type == "dpr": - if config.architectures[0] == "DPRQuestionEncoder": - language_model_class = "DPRQuestionEncoder" - elif config.architectures[0] == "DPRContextEncoder": - language_model_class = "DPRContextEncoder" - elif config.archictectures[0] == "DPRReader": - raise NotImplementedError("DPRReader models are currently not supported.") + @property + def output_dims(self): + """ + The output dimension of this language model + """ + if self._output_dims: + return self._output_dims - # Infer from model name if still not found - else: - logger.warning("Could not infer the class from config. Trying to infer class from model name.") - for regex, class_ in HF_MODEL_STRINGS_HINTS.items(): - if re.match(regex, model_name_or_path): - language_model_class = class_ - break + for odn in OUTPUT_DIM_NAMES: + try: + value = getattr(self.model.config, odn, None) + if value: + self._output_dims = value + return value + except AttributeError as e: + raise ModelingError("Can't get the output dimension before loading the model.") - # Notes for some models - if language_model_class == "Roberta" and "mlm" in model_name_or_path.lower(): - raise NotImplementedError("MLM part of codebert is currently not supported in Haystack.") + raise ModelingError("Could not infer the output dimensions of the language model.") - return language_model_class + def save_config(self, save_dir: Union[Path, str]): + """ + Save the configuration of the language model in Haystack format. 
+ """ + save_filename = Path(save_dir) / "language_model_config.json" + setattr(self.model.config, "name", self.name) + setattr(self.model.config, "language", self.language) - def get_output_dims(self): - config = self.model.config - for odn in OUTPUT_DIM_NAMES: - if odn in dir(config): - return getattr(config, odn) - raise Exception("Could not infer the output dimensions of the language model") + # For DPR models, transformers overwrites the model_type with the one set in DPRConfig + # Therefore, we copy the model_type from the model config to DPRConfig + if self.name == "DPRQuestionEncoder" or self.name == "DPRContextEncoder": + setattr(transformers.DPRConfig, "model_type", self.model.config.model_type) + string = self.model.config.to_json_string() - def save_config(self, save_dir): - save_filename = Path(save_dir) / "language_model_config.json" with open(save_filename, "w") as file: - setattr(self.model.config, "name", self.__class__.__name__) - setattr(self.model.config, "language", self.language) - # For DPR models, transformers overwrites the model_type with the one set in DPRConfig - # Therefore, we copy the model_type from the model config to DPRConfig - if self.__class__.__name__ == "DPRQuestionEncoder" or self.__class__.__name__ == "DPRContextEncoder": - setattr(transformers.DPRConfig, "model_type", self.model.config.model_type) - string = self.model.config.to_json_string() file.write(string) def save(self, save_dir: Union[str, Path], state_dict: Dict[Any, Any] = None): @@ -245,20 +169,13 @@ def save(self, save_dir: Union[str, Path], state_dict: Dict[Any, Any] = None): torch.save(state_dict, save_name) self.save_config(save_dir) - @staticmethod - def _infer_language_from_name(name: str) -> str: - language = "english" - languages = [lang for lang in KNOWN_LANGUAGES if lang in name] - if len(languages) == 0: - languages = [lang for model, lang in KNOWN_LANGUAGE_SPECIFIC_MODELS if model in name] - if len(languages) > 0: - language = languages[0] - else: - language = languages[0] - logger.info(f"Automatically detected language from model name: {language}") - return language - - def formatted_preds(self, logits, samples, ignore_first_token=True, padding_mask=None, input_ids=None, **kwargs): + def formatted_preds( + self, + logits, + samples, + ignore_first_token: bool = True, + padding_mask: torch.Tensor = None + ) -> List[Dict[str, Any]]: """ Extracting vectors from a language model (for example, for extracting sentence embeddings). You can use different pooling strategies and layers by specifying them in the object attributes @@ -277,7 +194,7 @@ def formatted_preds(self, logits, samples, ignore_first_token=True, padding_mask :return: A list of dictionaries containing predictions, for example: [{"context": "some text", "vec": [-0.01, 0.5 ...]}]. """ if not hasattr(self, "extraction_layer") or not hasattr(self, "extraction_strategy"): - raise ValueError( + raise ModelingError( "`extraction_layer` or `extraction_strategy` not specified for LM. " "Make sure to set both, e.g. via Inferencer(extraction_strategy='cls_token', extraction_layer=-1)`" ) @@ -289,12 +206,15 @@ def formatted_preds(self, logits, samples, ignore_first_token=True, padding_mask # aggregate vectors if self.extraction_strategy == "pooled": if self.extraction_layer != -1: - raise ValueError( - f"Pooled output only works for the last layer, but got extraction_layer = {self.extraction_layer}. 
Please set `extraction_layer=-1`.)" + raise ModelingError( + f"Pooled output only works for the last layer, but got extraction_layer={self.extraction_layer}. " + "Please set `extraction_layer=-1`" ) vecs = pooled_output.cpu().numpy() + elif self.extraction_strategy == "per_token": vecs = sequence_output.cpu().numpy() + elif self.extraction_strategy == "reduce_mean": vecs = self._pool_tokens( sequence_output, padding_mask, self.extraction_strategy, ignore_first_token=ignore_first_token @@ -306,7 +226,7 @@ def formatted_preds(self, logits, samples, ignore_first_token=True, padding_mask elif self.extraction_strategy == "cls_token": vecs = sequence_output[:, 0, :].cpu().numpy() else: - raise NotImplementedError + raise NotImplementedError(f"This extraction strategy ({self.extraction_strategy}) is not supported by Haystack.") preds = [] for vec, sample in zip(vecs, samples): @@ -316,7 +236,7 @@ def formatted_preds(self, logits, samples, ignore_first_token=True, padding_mask preds.append(pred) return preds - def _pool_tokens(self, sequence_output, padding_mask, strategy, ignore_first_token): + def _pool_tokens(self, sequence_output: torch.Tensor, padding_mask: torch.Tensor, strategy: str, ignore_first_token: bool): token_vecs = sequence_output.cpu().numpy() # we only take the aggregated value of non-padding tokens padding_mask = padding_mask.cpu().numpy() @@ -341,7 +261,15 @@ class HFLanguageModel(LanguageModel): """ @silence_transformers_logs - def __init__(self, pretrained_model_name_or_path: Union[Path, str], model_type: str, language: str = None, n_added_tokens: int = 0, auth_token: Optional[str] = None, **kwargs): + def __init__( + self, + pretrained_model_name_or_path: Union[Path, str], + model_type: str, + language: str = None, + n_added_tokens: int = 0, + auth_token: Optional[str] = None, + **kwargs + ): """ Load a pretrained model by supplying one of the following: @@ -349,14 +277,17 @@ def __init__(self, pretrained_model_name_or_path: Union[Path, str], model_type: * A local path of a model trained using transformers (for example, "some_dir/huggingface_model"). * A local path of a model trained using Haystack (for example, "some_dir/haystack_model"). + You can also use `get_language_model()` for a uniform interface across different model types. + :param pretrained_model_name_or_path: The path of the saved pretrained model or the name of the model. + :param model_type: the HuggingFace class name prefix (for example 'Bert', 'Roberta', etc...) 
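# --- Editor's note: illustrative sketch, not part of the patch. ---
# What the "reduce_mean" strategy in _pool_tokens() above computes: a per-sequence mean over
# token vectors that ignores padding positions. The patched code uses numpy masked arrays;
# this sketch computes the equivalent directly on dummy data:
import numpy as np

token_vecs = np.random.rand(2, 4, 8)                   # [batch, seq_len, hidden] dummy token embeddings
padding_mask = np.array([[1, 1, 1, 0], [1, 1, 0, 0]])  # 1 = real token, 0 = padding

sums = (token_vecs * padding_mask[..., None]).sum(axis=1)   # zero out padded positions, sum over tokens
counts = padding_mask.sum(axis=1, keepdims=True)            # number of real tokens per sequence
sentence_vecs = sums / counts                               # [batch, hidden] mean that excludes padding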
+ :param language: the model's language ('multilingual' is also accepted) + :param auth_token: the HF token, if necessary """ - super().__init__() - self.name = kwargs["haystack_lm_name"] if "haystack_lm_name" in kwargs else pretrained_model_name_or_path - - class_prefix = HF_PARAMETERS_BY_MODEL.get(model_type)["prefix"] - config_class: PretrainedConfig = getattr(transformers, class_prefix + "Config", None) - model_class: PreTrainedModel = getattr(transformers, class_prefix + "Model", None) + super().__init__(name=model_type) + + config_class: PretrainedConfig = getattr(transformers, model_type + "Config", None) + model_class: PreTrainedModel = getattr(transformers, model_type + "Model", None) # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -369,7 +300,7 @@ def __init__(self, pretrained_model_name_or_path: Union[Path, str], model_type: else: # Pytorch-transformer Style self.model = model_class.from_pretrained(str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **kwargs) - self.language = language or self._infer_language_from_name(pretrained_model_name_or_path) + self.language = language or _infer_language_from_name(pretrained_model_name_or_path) # resize embeddings in case of custom vocab if n_added_tokens != 0: @@ -384,15 +315,6 @@ def __init__(self, pretrained_model_name_or_path: Union[Path, str], model_type: model_emb_size = self.model.resize_token_embeddings(new_num_tokens=None).num_embeddings assert vocab_size == model_emb_size - # @classmethod - # def from_scratch(cls, vocab_size, name="bert", language="en"): - # bert = cls() - # bert.name = name - # bert.language = language - # config = BertConfig(vocab_size=vocab_size) - # bert.model = BertModel(config) - # return bert - def forward( self, input_ids: torch.Tensor, @@ -429,12 +351,6 @@ def forward( ) return output_tuple - def enable_hidden_states_output(self): - self.model.encoder.config.output_hidden_states = True - - def disable_hidden_states_output(self): - self.model.encoder.config.output_hidden_states = False - class HFLanguageModelWithPooler(HFLanguageModel): """ @@ -464,7 +380,7 @@ def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: st # The pooler takes the first hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim). # We don't want a dropout in the end of the pooler, since we do that already in the adaptive model before we # feed everything to the prediction head - sequence_summary_config = HF_PARAMETERS_BY_MODEL.get(self.name)["sequence_summary_config"] + sequence_summary_config = PARAMETERS_BY_MODEL.get(self.name.lower()) for key, value in sequence_summary_config.items(): setattr(config, key, value) @@ -502,201 +418,6 @@ def forward( # type: ignore return (output_tuple[0], pooled_output) + output_tuple[1:] -class Bert(HFLanguageModel): - """ - A BERT model that wraps Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - Paper: https://arxiv.org/abs/1810.04805. 
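# --- Editor's note: illustrative sketch, not part of the patch. ---
# The dedicated wrapper classes removed here (Bert, Albert, Roberta, XLMRoberta, ...) collapse
# into the generic HFLanguageModel, parametrized by the Hugging Face class-name prefix that is
# used to resolve `<prefix>Config` and `<prefix>Model` from transformers. A hedged equivalent
# of the old `Bert(...)` wrapper; the model name is an example:
from haystack.modeling.model.language_model import HFLanguageModel

bert = HFLanguageModel("bert-base-uncased", model_type="Bert", language="english")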
- """ - - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): - super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, - language=language, - n_added_tokens=n_added_tokens, - model_type="bert", - **kwargs - ) - - -class Albert(HFLanguageModel): - """ - An ALBERT model that wraps the Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - """ - - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): - super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, - language=language, - n_added_tokens=n_added_tokens, - model_type="albert", - **kwargs - ) - - -class Roberta(HFLanguageModel): - """ - A roberta model that wraps the Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - Paper: https://arxiv.org/abs/1907.11692 - """ - - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): - super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, - language=language, - n_added_tokens=n_added_tokens, - model_type="roberta", - **kwargs - ) - - -class XLMRoberta(HFLanguageModel): - """ - A roberta model that wraps the Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - Paper: https://arxiv.org/abs/1907.11692 - """ - - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): - super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, - language=language, - n_added_tokens=n_added_tokens, - model_type="xlm-roberta", - **kwargs - ) - -class DistilBert(HFLanguageModelWithPooler): - """ - A DistilBERT model that wraps Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - - NOTE: - - DistilBert doesn't have `token_type_ids`, you don't need to indicate which - token belongs to which segment. Just separate your segments with the separation - token `tokenizer.sep_token` (or [SEP]). - - Unlike the other BERT variants, DistilBert does not output the - `pooled_output`. An additional pooler is initialized. - """ - - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): - super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, - language=language, - n_added_tokens=n_added_tokens, - model_type="distilbert", - **kwargs - ) - -class XLNet(HFLanguageModelWithPooler): - """ - A XLNet model that wraps the Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - Paper: https://arxiv.org/abs/1906.08237 - """ - - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): - super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, - language=language, - n_added_tokens=n_added_tokens, - model_type="xlnet", - **kwargs - ) - -class Electra(HFLanguageModelWithPooler): - """ - ELECTRA is a new pre-training approach which trains two transformer models: - the generator and the discriminator. The generator replaces tokens in a sequence, - and is therefore trained as a masked language model. 
The discriminator, which is - the model we're interested in, tries to identify which tokens were replaced by - the generator in the sequence. - - The ELECTRA model here wraps Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - - NOTE: - - Electra does not output the `pooled_output`. An additional pooler is initialized. - """ - - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): - super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, - language=language, - n_added_tokens=n_added_tokens, - model_type="electra", - **kwargs - ) - -class Camembert(HFLanguageModel): - """ - A Camembert model that wraps the Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - """ - - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): - super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, - language=language, - n_added_tokens=n_added_tokens, - model_type="camembert", - **kwargs - ) - -class BigBird(HFLanguageModel): - """ - A BERT model that wraps Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - Paper: https://arxiv.org/abs/1810.04805 - """ - - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): - super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, - language=language, - n_added_tokens=n_added_tokens, - model_type="bigbird", - **kwargs - ) - -class DebertaV2(HFLanguageModelWithPooler): - """ - This is a wrapper around the DebertaV2 model from Hugging Face's transformers library. - It is also compatible with DebertaV3 as DebertaV3 only changes the pretraining procedure. - - NOTE: - - DebertaV2 does not output the `pooled_output`. An additional pooler is initialized. - """ - - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): - super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, - language=language, - n_added_tokens=n_added_tokens, - model_type="deberta-v2", - **kwargs - ) - - -class Data2VecVision(HFLanguageModel): - """ - A Data2Vec (Vision) model that wraps Hugging Face's implementation - (https://github.com/huggingface/transformers) to fit the LanguageModel class. - Paper: https://arxiv.org/abs/1810.04805. - """ - - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): - super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, - language=language, - n_added_tokens=n_added_tokens, - model_type="data2vec-vision", - **kwargs - ) - - class DPRQuestionEncoder(LanguageModel): """ A DPRQuestionEncoder model that wraps Hugging Face's implementation. @@ -707,7 +428,6 @@ def __init__( self, pretrained_model_name_or_path: Union[Path, str], language: str = None, - n_added_tokens: int = 0, auth_token: Optional[str] = None, **kwargs, ): @@ -720,11 +440,7 @@ def __init__( :param pretrained_model_name_or_path: The path of the base pretrained language model whose weights are used to initialize DPRQuestionEncoder. 
""" - super().__init__() - if "haystack_lm_name" in kwargs: - self.name = kwargs["haystack_lm_name"] - else: - self.name = pretrained_model_name_or_path + super().__init__(name="DPRQuestionEncoder") # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -739,7 +455,7 @@ def __init__( haystack_lm_model, config=dpr_config, **kwargs ) else: - if original_model_config.model_type != "bert": + if original_model_config.model_type.lower() != "bert": logger.warning( f"Using a model of type '{original_model_config.model_type}' which might be incompatible with DPR encoders." f"Bert based encoders are supported that need input_ids,token_type_ids,attention_mask as input tensors." @@ -749,16 +465,14 @@ def __init__( self.model = transformers.DPRQuestionEncoder( config=transformers.DPRConfig(**original_config_dict) ) - language_model_class = DPRQuestionEncoder.get_language_model_class(haystack_lm_config, auth_token or False, **kwargs) - self.model.base_model.bert_model = ( - DPRQuestionEncoder.subclasses[language_model_class](str(pretrained_model_name_or_path)).model - ) + self.model.base_model.bert_model = get_language_model(str(pretrained_model_name_or_path), auth_token=auth_token).model + self.language = self.model.config.language else: original_model_config = AutoConfig.from_pretrained( pretrained_model_name_or_path, use_auth_token=auth_token or False ) - if original_model_config.model_type == "dpr": + if "dpr" in original_model_config.model_type.lower(): # "pretrained dpr model": load existing pretrained DPRQuestionEncoder model self.model = transformers.DPRQuestionEncoder.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **kwargs @@ -780,7 +494,7 @@ def __init__( self.model.base_model.bert_model = AutoModel.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **original_config_dict ) - self.language = language or DPRQuestionEncoder._infer_language_from_name(pretrained_model_name_or_path) + self.language = language or _infer_language_from_name(pretrained_model_name_or_path) def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] = None): @@ -793,7 +507,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] """ model_to_save = self.model.module if hasattr(self.model, "module") else self.model # Only save the model itself - if self.model.config.model_type != "dpr" and model_to_save.base_model_prefix.startswith("question_"): + if "dpr" not in self.model.config.model_type.lower() and model_to_save.base_model_prefix.startswith("question_"): state_dict = model_to_save.state_dict() if state_dict: keys = state_dict.keys() @@ -809,26 +523,25 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] def forward( # type: ignore self, - query_input_ids: torch.Tensor, - query_segment_ids: torch.Tensor, - query_attention_mask: torch.Tensor, - **kwargs, + input_ids: torch.Tensor, + segment_ids: torch.Tensor, + attention_mask: torch.Tensor, ): """ Perform the forward pass of the DPRQuestionEncoder model. - :param query_input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. - :param query_segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the + :param input_ids: The IDs of each token in the input sequence. 
It's a tensor of shape [batch_size, max_seq_len]. + :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the first sentence are marked with 0 and the tokens in the second sentence are marked with 1. It is a tensor of shape [batch_size, max_seq_len]. - :param query_attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens + :param attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens of shape [batch_size, max_seq_len]. :return: Embeddings for each token in the input sequence. """ output_tuple = self.model( - input_ids=query_input_ids, - token_type_ids=query_segment_ids, - attention_mask=query_attention_mask, + input_ids=input_ids, + token_type_ids=segment_ids, + attention_mask=attention_mask, return_dict=True, ) if self.model.question_encoder.config.output_hidden_states == True: @@ -854,7 +567,6 @@ def __init__( self, pretrained_model_name_or_path: Union[Path, str], language: str = None, - n_added_tokens: int = 0, auth_token: Optional[str] = None, **kwargs, ): @@ -867,11 +579,7 @@ def __init__( :param pretrained_model_name_or_path: The path of the base pretrained language model whose weights are used to initialize DPRContextEncoder. """ - super().__init__() - if "haystack_lm_name" in kwargs: - self.name = kwargs["haystack_lm_name"] - else: - self.name = pretrained_model_name_or_path + super().__init__(name="DPRContextEncoder") # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -880,13 +588,13 @@ def __init__( original_model_config = AutoConfig.from_pretrained(haystack_lm_config) haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - if original_model_config.model_type == "dpr": + if "dpr" in original_model_config.model_type.lower(): dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) self.model = transformers.DPRContextEncoder.from_pretrained( haystack_lm_model, config=dpr_config, use_auth_token=auth_token or False, **kwargs ) else: - if original_model_config.model_type != "bert": + if original_model_config.model_type.lower() != "bert": logger.warning( f"Using a model of type '{original_model_config.model_type}' which might be incompatible with DPR encoders." f"Bert based encoders are supported that need input_ids,token_type_ids,attention_mask as input tensors." 
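# --- Editor's note: illustrative sketch, not part of the patch. ---
# The DPR encoders' forward() now takes plain input_ids / segment_ids / attention_mask; the
# "query_"/"passage_" prefixes are stripped upstream in BiAdaptiveModel.forward_lm(). A hedged
# usage sketch against the public facebook DPR question-encoder checkpoint:
import torch
from transformers import AutoTokenizer
from haystack.modeling.model.language_model import DPRQuestionEncoder

question_encoder = DPRQuestionEncoder("facebook/dpr-question_encoder-single-nq-base")
tokenizer = AutoTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

batch = tokenizer(["What is dense passage retrieval?"], return_tensors="pt", padding=True)
with torch.no_grad():
    pooled, hidden_states = question_encoder(
        input_ids=batch["input_ids"],
        segment_ids=batch["token_type_ids"],
        attention_mask=batch["attention_mask"],
    )
print(pooled.shape)  # torch.Size([1, 768]) -- one embedding per query; hidden_states is None by default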
@@ -896,10 +604,7 @@ def __init__( self.model = transformers.DPRContextEncoder( config=transformers.DPRConfig(**original_config_dict) ) - language_model_class = DPRQuestionEncoder.get_language_model_class(haystack_lm_config, **kwargs) - self.model.base_model.bert_model = ( - DPRContextEncoder.subclasses[language_model_class](str(pretrained_model_name_or_path), auth_token=auth_token).model - ) + self.model.base_model.bert_model = get_language_model(str(pretrained_model_name_or_path), auth_token=auth_token).model self.language = self.model.config.language else: @@ -907,7 +612,7 @@ def __init__( original_model_config = AutoConfig.from_pretrained( pretrained_model_name_or_path, use_auth_token=auth_token or False ) - if original_model_config.model_type == "dpr": + if "dpr" in original_model_config.model_type.lower(): # "pretrained dpr model": load existing pretrained DPRContextEncoder model self.model = transformers.DPRContextEncoder.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **kwargs @@ -916,7 +621,7 @@ def __init__( # "from scratch": load weights from different architecture (e.g. bert) into DPRContextEncoder # but keep config values from original architecture # TODO test for architectures other than BERT, e.g. Electra - if original_model_config.model_type != "bert": + if original_model_config.model_type.lower() != "bert": logger.warning( f"Using a model of type '{original_model_config.model_type}' which might be incompatible with DPR encoders." f"Bert based encoders are supported that need input_ids,token_type_ids,attention_mask as input tensors." @@ -929,7 +634,7 @@ def __init__( self.model.base_model.bert_model = AutoModel.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **original_config_dict ) - self.language = language or DPRContextEncoder._infer_language_from_name(pretrained_model_name_or_path) + self.language = language or _infer_language_from_name(pretrained_model_name_or_path) def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] = None): @@ -943,7 +648,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] self.model.module if hasattr(self.model, "module") else self.model ) # Only save the model it-self - if self.model.config.model_type != "dpr" and model_to_save.base_model_prefix.startswith("ctx_"): + if "dpr" not in self.model.config.model_type.lower() and model_to_save.base_model_prefix.startswith("ctx_"): state_dict = model_to_save.state_dict() if state_dict: keys = state_dict.keys() @@ -959,10 +664,9 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] def forward( # type: ignore self, - passage_input_ids: torch.Tensor, - passage_segment_ids: torch.Tensor, - passage_attention_mask: torch.Tensor, - **kwargs, + input_ids: torch.Tensor, + segment_ids: torch.Tensor, + attention_mask: torch.Tensor, ): """ Perform the forward pass of the DPRContextEncoder model. @@ -975,14 +679,14 @@ def forward( # type: ignore of shape [batch_size, number_of_hard_negative_passages, max_seq_len]. :return: Embeddings for each token in the input sequence. 
""" - max_seq_len = passage_input_ids.shape[-1] - passage_input_ids = passage_input_ids.view(-1, max_seq_len) - passage_segment_ids = passage_segment_ids.view(-1, max_seq_len) - passage_attention_mask = passage_attention_mask.view(-1, max_seq_len) + max_seq_len = input_ids.shape[-1] + input_ids = input_ids.view(-1, max_seq_len) + segment_ids = segment_ids.view(-1, max_seq_len) + attention_mask = attention_mask.view(-1, max_seq_len) output_tuple = self.model( - input_ids=passage_input_ids, - token_type_ids=passage_segment_ids, - attention_mask=passage_attention_mask, + input_ids=input_ids, + token_type_ids=segment_ids, + attention_mask=attention_mask, return_dict=True, ) if self.model.ctx_encoder.config.output_hidden_states == True: @@ -997,3 +701,166 @@ def enable_hidden_states_output(self): def disable_hidden_states_output(self): self.model.ctx_encoder.config.output_hidden_states = False + + +HUGGINGFACE_TO_HAYSTACK = { + "Albert": HFLanguageModel, + "Bert": HFLanguageModel, + "BigBird": HFLanguageModel, + "Camembert": HFLanguageModel, + "Codebert": HFLanguageModel, + "Data2VecVision": HFLanguageModel, + "DebertaV2": HFLanguageModelWithPooler, + "DistilBert": HFLanguageModelWithPooler, + "DPRContextEncoder": DPRContextEncoder, + "DPRQuestionEncoder": DPRQuestionEncoder, + "Electra": HFLanguageModelWithPooler, + "GloVe": HFLanguageModel, + "MiniLM": HFLanguageModel, + "Roberta": HFLanguageModel, + "Umberto": HFLanguageModel, + "Word2Vec": HFLanguageModel, + "WordEmbedding_LM": HFLanguageModel, + "XLMRoberta": HFLanguageModel, + "XLNet": HFLanguageModelWithPooler, + +} +NAME_HINTS = { + "xlm.*roberta": "XLMRoberta", + "roberta.*xml": "XLMRoberta", + "codebert.*mlm": "Roberta", + "mlm.*codebert": "Roberta", + "deberta-v2": "DebertaV2", + "data2vec-vision": "Data2VecVision", +} +PARAMETERS_BY_MODEL = { + "DistilBert": {"summary_last_dropout": 0, "summary_type": "first", "summary_activation": "tanh"}, + "XLNet": {"summary_last_dropout": 0}, + "Electra": { + "summary_last_dropout": 0, + "summary_type": "first", + "summary_activation": "gelu", + "summary_use_proj": False, + }, + "DebertaV2": { + "summary_last_dropout": 0, + "summary_type": "first", + "summary_activati": "tanh", + "summary_use_proj": False, + }, +} + +def get_language_model( + pretrained_model_name_or_path: Union[Path, str], + language_model_type: Optional[str] = None, + auth_token: Optional[str] = None, + revision: Optional[str] = None, + **kwargs +) -> LanguageModel: + """ + Load a pretrained language model by doing one of the following: + + 1. Specifying its name and downloading the model. + 2. Pointing to the directory the model is saved in. + + See all supported model variations at: https://huggingface.co/models. + + The appropriate language model class is inferred automatically from model configuration + or can be manually supplied using `language_model_class`. + + :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. + :param revision: The version of the model to use from the Hugging Face model hub. This can be a tag name, a branch name, or a commit hash. + :param language_model_type: (Optional) Name of the language model class to load (for example `Bert`). Overrides any other discovered value. 
+ """ + logger.info(f"Loading model '{pretrained_model_name_or_path}'") + + config_file = Path(pretrained_model_name_or_path) / "language_model_config.json" + + if language_model_type is None: + + if os.path.exists(config_file): + # it's a local directory in Haystack format + logger.info(f"Model found locally at {pretrained_model_name_or_path}") + config = json.load(open(config_file)) + language_model_type = config["name"] + + else: + # It's from the model hub + logger.info(f"Could not find '{pretrained_model_name_or_path}' locally.") + logger.info(f"Looking on Transformers Model Hub (in local cache and online)...") + language_model_type = _get_model_type( + pretrained_model_name_or_path, auth_token=auth_token, revision=revision, **kwargs + ) + if not language_model_type: + raise Exception( + f"Model not found for '{pretrained_model_name_or_path}'. Either supply the local path for a saved " + f"model or one of bert/roberta/xlnet/albert/distilbert models that can be downloaded from remote. " + f"Ensure that the model class name can be inferred from the directory name when loading a " + f"Transformers' model." + ) + + # Find the class corresponding to this model type + language_model_class: Type[LanguageModel] = HUGGINGFACE_TO_HAYSTACK.get(language_model_type, None) + if not language_model_class: + raise ValueError( + f"The type of model supplied ({language_model_type}) is not supported by Haystack. " + f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}") + + # Instantiate the class for this model + language_model = language_model_class( + pretrained_model_name_or_path, + model_type=language_model_type, + auth_token=auth_token, + **kwargs + ) + logger.info(f"Loaded '{pretrained_model_name_or_path}' ({language_model_type} model)") + return language_model + + +def _get_model_type(model_name_or_path: Union[str, Path], auth_token: Optional[str] = None, revision: Optional[str] = None, **kwargs) -> str: + """ + Given a model name, try to use AutoConfig to understand which model type it is. + In case it's not successful, tries to infer the type from the name of the model. + """ + # Use AutoConfig to understand the model class + model_name_or_path = str(model_name_or_path) + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path=model_name_or_path, + use_auth_token=auth_token or False, + revision=revision, + **kwargs + ) + + # Find if this mode is present in MODEL_TYPE_BY_NAME.keys() even with a different capitalization + model_type = {key.lower(): key for key in HUGGINGFACE_TO_HAYSTACK.keys()}.get(config.model_type.lower(), None) + + if not model_type: + # DPR + if "dpr" in config.model_type: + if config.archictectures[0] == "DPRReader": + raise NotImplementedError("DPRReader models are currently not supported.") + model_type = config.architectures[0] + + else: + logger.warning("Could not infer the class from config. 
Trying to infer class from model name.") + + # Look for other patterns and variation that hints at the model type + for regex, model_name in NAME_HINTS.keys(): + if re.match(regex, model_name_or_path): + model_type = model_name + break + + if model_type == "Roberta" and "mlm" in model_name_or_path.lower(): + logging.error(f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later.") + + return model_type + + +def _infer_language_from_name(name: str) -> str: + languages = [lang for hint, lang in LANGUAGE_HINTS if hint.lower() in name.lower()] + if len(languages) > 0: + language = languages[0] + else: + language = "english" + logger.info(f"Automatically detected language from model name: {language}") + return language \ No newline at end of file diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index 63138f47f1..6e03e01351 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -37,30 +37,32 @@ def get_tokenizer( use_fast: bool = True, auth_token: Optional[str] = None, **kwargs, -): +) -> PreTrainedTokenizer: """ - Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from - model config or define it manually via `tokenizer_classname`. + Enables loading of different Tokenizer classes with a uniform interface. + Right now it always returns an instance of `AutoTokenizer`. :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`) :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. :param use_fast: Indicate if Haystack should try to load the fast version of the tokenizer (True) or use the Python one (False). Defaults to True. 
:param auth_token: The auth_token to use in `PretrainedTokenizer.from_pretrained()`, if required :param kwargs: other kwargs to pass on to `PretrainedTokenizer.from_pretrained()` - :return: Tokenizer + :return: AutoTokenizer instance """ model_name_or_path = str(pretrained_model_name_or_path) params = {} + if auth_token: + params["use_auth_token"] = auth_token + if "mlm" in model_name_or_path.lower(): raise NotImplementedError("MLM part of codebert is currently not supported in Haystack") - if any("albert", "xlnet") in model_name_or_path: + if any(tokenizer_type in model_name_or_path for tokenizer_type in ["albert", "xlnet"]): params["keep_accents"] = True return AutoTokenizer.from_pretrained( model_name_or_path, - use_auth_token=auth_token or False, revision=revision, use_fast=use_fast, **params, diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index 9d3e8cfe63..6cc4f56011 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -7,7 +7,7 @@ from torch import nn from haystack.modeling.data_handler.processor import Processor -from haystack.modeling.model.language_model import LanguageModel +from haystack.modeling.model.language_model import get_language_model, LanguageModel from haystack.modeling.model.prediction_head import PredictionHead from haystack.utils.experiment_tracking import Tracker as tracker @@ -87,11 +87,11 @@ def __init__( super(TriAdaptiveModel, self).__init__() self.device = device self.language_model1 = language_model1.to(device) - self.lm1_output_dims = language_model1.get_output_dims() + self.lm1_output_dims = language_model1.output_dims self.language_model2 = language_model2.to(device) - self.lm2_output_dims = language_model2.get_output_dims() + self.lm2_output_dims = language_model2.output_dims self.language_model3 = language_model3.to(device) - self.lm3_output_dims = language_model3.get_output_dims() + self.lm3_output_dims = language_model3.output_dims self.dropout1 = nn.Dropout(embeds_dropout_prob) self.dropout2 = nn.Dropout(embeds_dropout_prob) self.dropout3 = nn.Dropout(embeds_dropout_prob) @@ -165,17 +165,17 @@ def load( """ # Language Model if lm1_name: - language_model1 = LanguageModel.load(os.path.join(load_dir, lm1_name)) + language_model1 = get_language_model(os.path.join(load_dir, lm1_name)) else: - language_model1 = LanguageModel.load(load_dir) + language_model1 = get_language_model(load_dir) if lm2_name: - language_model2 = LanguageModel.load(os.path.join(load_dir, lm2_name)) + language_model2 = get_language_model(os.path.join(load_dir, lm2_name)) else: - language_model2 = LanguageModel.load(load_dir) + language_model2 = get_language_model(load_dir) if lm3_name: - language_model3 = LanguageModel.load(os.path.join(load_dir, lm3_name)) + language_model3 = get_language_model(os.path.join(load_dir, lm3_name)) else: - language_model3 = LanguageModel.load(load_dir) + language_model3 = get_language_model(load_dir) # Prediction heads ph_config_files = cls._get_prediction_head_files(load_dir) @@ -382,7 +382,7 @@ def verify_vocab_size(self, vocab_size1: int, vocab_size2: int, vocab_size3: int msg = ( f"Vocab size of tokenizer {vocab_size1} doesn't match with model {model1_vocab_len}. 
" "If you added a custom vocabulary to the tokenizer, " - "make sure to supply 'n_added_tokens' to LanguageModel.load() and BertStyleLM.load()" + "make sure to supply 'n_added_tokens' to get_language_model() and BertStyleLM.load()" ) assert vocab_size1 == model1_vocab_len, msg @@ -391,7 +391,7 @@ def verify_vocab_size(self, vocab_size1: int, vocab_size2: int, vocab_size3: int msg = ( f"Vocab size of tokenizer {vocab_size1} doesn't match with model {model2_vocab_len}. " "If you added a custom vocabulary to the tokenizer, " - "make sure to supply 'n_added_tokens' to LanguageModel.load() and BertStyleLM.load()" + "make sure to supply 'n_added_tokens' to get_language_model() and BertStyleLM.load()" ) assert vocab_size2 == model2_vocab_len, msg @@ -400,7 +400,7 @@ def verify_vocab_size(self, vocab_size1: int, vocab_size2: int, vocab_size3: int msg = ( f"Vocab size of tokenizer {vocab_size3} doesn't match with model {model3_vocab_len}. " "If you added a custom vocabulary to the tokenizer, " - "make sure to supply 'n_added_tokens' to LanguageModel.load() and BertStyleLM.load()" + "make sure to supply 'n_added_tokens' to get_language_model() and BertStyleLM.load()" ) assert vocab_size3 == model1_vocab_len, msg diff --git a/haystack/modeling/training/base.py b/haystack/modeling/training/base.py index 5031a20d50..0d585f75ff 100644 --- a/haystack/modeling/training/base.py +++ b/haystack/modeling/training/base.py @@ -18,7 +18,6 @@ from haystack.modeling.evaluation.eval import Evaluator from haystack.modeling.model.adaptive_model import AdaptiveModel from haystack.modeling.model.optimization import get_scheduler -from haystack.modeling.model.language_model import DebertaV2 from haystack.modeling.utils import GracefulKiller from haystack.utils.experiment_tracking import Tracker as tracker @@ -251,9 +250,7 @@ def train(self): vocab_size1=len(self.data_silo.processor.query_tokenizer), vocab_size2=len(self.data_silo.processor.passage_tokenizer), ) - elif not isinstance( - self.model.language_model, DebertaV2 - ): # DebertaV2 has mismatched vocab size on purpose (see https://github.com/huggingface/transformers/issues/12428) + elif not self.model.language_model.name == "debertav2": # DebertaV2 has mismatched vocab size on purpose (see https://github.com/huggingface/transformers/issues/12428) self.model.verify_vocab_size(vocab_size=len(self.data_silo.processor.tokenizer)) self.model.train() diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 3d13d81e99..33b862e178 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -21,7 +21,7 @@ from haystack.nodes.retriever.base import BaseRetriever from haystack.nodes.retriever._embedding_encoder import _EMBEDDING_ENCODERS from haystack.modeling.model.tokenization import get_tokenizer -from haystack.modeling.model.language_model import LanguageModel +from haystack.modeling.model.language_model import get_language_model from haystack.modeling.model.biadaptive_model import BiAdaptiveModel from haystack.modeling.model.triadaptive_model import TriAdaptiveModel from haystack.modeling.model.prediction_head import TextSimilarityHead @@ -166,7 +166,7 @@ def __init__( tokenizer_class=tokenizers_default_classes["query"], use_auth_token=use_auth_token, ) - self.query_encoder = LanguageModel.load( + self.query_encoder = get_language_model( pretrained_model_name_or_path=query_embedding_model, revision=model_version, language_model_class="DPRQuestionEncoder", @@ -180,7 +180,7 @@ def __init__( 
tokenizer_class=tokenizers_default_classes["passage"], use_auth_token=use_auth_token, ) - self.passage_encoder = LanguageModel.load( + self.passage_encoder = get_language_model( pretrained_model_name_or_path=passage_embedding_model, revision=model_version, language_model_class="DPRContextEncoder", @@ -875,7 +875,7 @@ def __init__( tokenizer_class=tokenizers_default_classes["query"], use_auth_token=use_auth_token, ) - self.query_encoder = LanguageModel.load( + self.query_encoder = get_language_model( pretrained_model_name_or_path=query_embedding_model, revision=model_version, language_model_class="DPRQuestionEncoder", @@ -889,7 +889,7 @@ def __init__( tokenizer_class=tokenizers_default_classes["passage"], use_auth_token=use_auth_token, ) - self.passage_encoder = LanguageModel.load( + self.passage_encoder = get_language_model( pretrained_model_name_or_path=passage_embedding_model, revision=model_version, language_model_class="DPRContextEncoder", @@ -903,7 +903,7 @@ def __init__( tokenizer_class=tokenizers_default_classes["table"], use_auth_token=use_auth_token, ) - self.table_encoder = LanguageModel.load( + self.table_encoder = get_language_model( pretrained_model_name_or_path=table_embedding_model, revision=model_version, language_model_class="DPRContextEncoder", diff --git a/test/modeling/test_modeling_dpr.py b/test/modeling/test_modeling_dpr.py index 978e118260..d8090f596a 100644 --- a/test/modeling/test_modeling_dpr.py +++ b/test/modeling/test_modeling_dpr.py @@ -1,3 +1,4 @@ +import os import logging from pathlib import Path @@ -10,7 +11,7 @@ from haystack.modeling.data_handler.dataloader import NamedDataLoader from haystack.modeling.data_handler.processor import TextSimilarityProcessor from haystack.modeling.model.biadaptive_model import BiAdaptiveModel -from haystack.modeling.model.language_model import LanguageModel, DPRContextEncoder, DPRQuestionEncoder +from haystack.modeling.model.language_model import get_language_model, DPRContextEncoder, DPRQuestionEncoder from haystack.modeling.model.prediction_head import TextSimilarityHead from haystack.modeling.model.tokenization import get_tokenizer from haystack.modeling.utils import set_all_seeds, initialize_device_settings @@ -46,15 +47,13 @@ def test_dpr_modules(caplog=None): num_hard_negatives=1, ) - question_language_model = LanguageModel.load( + question_language_model = DPRQuestionEncoder( pretrained_model_name_or_path="bert-base-uncased", - language_model_class="DPRQuestionEncoder", hidden_dropout_prob=0, attention_probs_dropout_prob=0, ) - passage_language_model = LanguageModel.load( + passage_language_model = DPRContextEncoder( pretrained_model_name_or_path="bert-base-uncased", - language_model_class="DPRContextEncoder", hidden_dropout_prob=0, attention_probs_dropout_prob=0, ) @@ -131,9 +130,12 @@ def test_dpr_modules(caplog=None): torch.eq(features["passage_attention_mask"][0][1].nonzero().cpu().squeeze(), torch.tensor(list(range(143)))) ) + features_query = {key.replace("query_", ""): value for key, value in features.items() if key.startswith("query_")} + features_passage = {key.replace("passage_", ""): value for key, value in features.items() if key.startswith("passage_")} + # test model encodings - query_vector = model.language_model1(**features)[0] - passage_vector = model.language_model2(**features)[0] + query_vector = model.language_model1(**features_query)[0] + passage_vector = model.language_model2(**features_passage)[0] assert torch.all( torch.le( query_vector[0, :10].cpu() @@ -485,9 +487,9 @@ def 
test_dpr_problematic(): ] query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = get_tokenizer(query_tok, use_fast=True) + query_tokenizer = get_tokenizer(query_tok) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = get_tokenizer(passage_tok, use_fast=True) + passage_tokenizer = get_tokenizer(passage_tok) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -516,9 +518,9 @@ def test_dpr_query_only(): ] query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = get_tokenizer(query_tok, use_fast=True) + query_tokenizer = get_tokenizer(query_tok) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = get_tokenizer(passage_tok, use_fast=True) + passage_tokenizer = get_tokenizer(passage_tok) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -578,9 +580,9 @@ def test_dpr_context_only(): ] query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = get_tokenizer(query_tok, use_fast=True) + query_tokenizer = get_tokenizer(query_tok) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = get_tokenizer(passage_tok, use_fast=True) + passage_tokenizer = get_tokenizer(passage_tok) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -629,9 +631,9 @@ def test_dpr_processor_save_load(tmp_path): } query_tok = "facebook/dpr-question_encoder-single-nq-base" - query_tokenizer = get_tokenizer(query_tok, use_fast=True) + query_tokenizer = get_tokenizer(query_tok) passage_tok = "facebook/dpr-ctx_encoder-single-nq-base" - passage_tokenizer = get_tokenizer(passage_tok, use_fast=True) + passage_tokenizer = get_tokenizer(passage_tok) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, passage_tokenizer=passage_tokenizer, @@ -646,9 +648,10 @@ def test_dpr_processor_save_load(tmp_path): metric="text_similarity_metric", shuffle_negatives=False, ) - processor.save(save_dir=f"{tmp_path}/testsave/dpr_processor") + save_dir = f"{tmp_path}/testsave/dpr_processor" + processor.save(save_dir=save_dir) dataset, tensor_names, _ = processor.dataset_from_dicts(dicts=[d], return_baskets=False) - loadedprocessor = TextSimilarityProcessor.load_from_dir(load_dir=f"{tmp_path}/testsave/dpr_processor") + loadedprocessor = TextSimilarityProcessor.load_from_dir(load_dir=save_dir) dataset2, tensor_names, _ = loadedprocessor.dataset_from_dicts(dicts=[d], return_baskets=False) assert np.array_equal(dataset.tensors[0], dataset2.tensors[0]) @@ -692,12 +695,12 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ query_tokenizer = get_tokenizer( pretrained_model_name_or_path=query_embedding_model ) # tokenizer class is inferred automatically - query_encoder = LanguageModel.load( - pretrained_model_name_or_path=query_embedding_model, language_model_class="DPRQuestionEncoder" + query_encoder = DPRQuestionEncoder( + pretrained_model_name_or_path=query_embedding_model ) passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_embedding_model) - passage_encoder = LanguageModel.load( - pretrained_model_name_or_path=passage_embedding_model, language_model_class="DPRContextEncoder" + passage_encoder = DPRContextEncoder( + pretrained_model_name_or_path=passage_embedding_model ) processor = TextSimilarityProcessor( @@ -740,13 +743,13 @@ def 
test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ loaded_query_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, use_fast=True ) # tokenizer class is inferred automatically - loaded_query_encoder = LanguageModel.load( + loaded_query_encoder = get_language_model( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, language_model_class="DPRQuestionEncoder" ) loaded_passage_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, use_fast=True ) - loaded_passage_encoder = LanguageModel.load( + loaded_passage_encoder = get_language_model( pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, language_model_class="DPRContextEncoder" ) @@ -852,11 +855,11 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ query_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir ) # tokenizer class is inferred automatically - query_encoder = LanguageModel.load( + query_encoder = get_language_model( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, language_model_class="DPRQuestionEncoder" ) passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir) - passage_encoder = LanguageModel.load( + passage_encoder = get_language_model( pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, language_model_class="DPRContextEncoder" ) @@ -965,9 +968,9 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ # # data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False) # -# question_language_model = LanguageModel.load(pretrained_model_name_or_path=question_lang_model, +# question_language_model = get_language_model(pretrained_model_name_or_path=question_lang_model, # language_model_class="DPRQuestionEncoder") -# passage_language_model = LanguageModel.load(pretrained_model_name_or_path=passage_lang_model, +# passage_language_model = get_language_model(pretrained_model_name_or_path=passage_lang_model, # language_model_class="DPRContextEncoder") # # prediction_head = TextSimilarityHead(similarity_function=similarity_function) diff --git a/test/modeling/test_modeling_prediction_head.py b/test/modeling/test_modeling_prediction_head.py index e607bce7cc..368afc5022 100644 --- a/test/modeling/test_modeling_prediction_head.py +++ b/test/modeling/test_modeling_prediction_head.py @@ -1,7 +1,7 @@ import logging from haystack.modeling.model.adaptive_model import AdaptiveModel -from haystack.modeling.model.language_model import LanguageModel +from haystack.modeling.model.language_model import get_language_model from haystack.modeling.model.prediction_head import QuestionAnsweringHead from haystack.modeling.utils import set_all_seeds, initialize_device_settings @@ -14,7 +14,7 @@ def test_prediction_head_load_save(tmp_path, caplog=None): devices, n_gpu = initialize_device_settings(use_cuda=False) lang_model = "bert-base-german-cased" - language_model = LanguageModel.load(lang_model) + language_model = get_language_model(lang_model) prediction_head = QuestionAnsweringHead() model = AdaptiveModel( diff --git a/test/modeling/test_modeling_processor.py b/test/modeling/test_modeling_processor.py index 8e3e6f9328..79308d80f8 100644 --- a/test/modeling/test_modeling_processor.py +++ b/test/modeling/test_modeling_processor.py @@ -24,7 +24,7 @@ def test_dataset_from_dicts_qa_inference(caplog=None): sample_types = 
["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"] for model in models: - tokenizer = get_tokenizer(pretrained_model_name_or_path=model, use_fast=True) + tokenizer = get_tokenizer(pretrained_model_name_or_path=model) processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None) for sample_type in sample_types: @@ -251,7 +251,7 @@ def test_dataset_from_dicts_qa_labelconversion(caplog=None): sample_types = ["answer-wrong", "answer-offset-wrong", "noanswer", "vanilla"] for model in models: - tokenizer = get_tokenizer(pretrained_model_name_or_path=model, use_fast=True) + tokenizer = get_tokenizer(pretrained_model_name_or_path=model) processor = SquadProcessor(tokenizer, max_seq_len=256, data_dir=None) for sample_type in sample_types: diff --git a/test/modeling/test_tokenization.py b/test/modeling/test_tokenization.py index 1a371972aa..865348ce7c 100644 --- a/test/modeling/test_tokenization.py +++ b/test/modeling/test_tokenization.py @@ -225,7 +225,7 @@ def test_save_load(tmp_path, caplog): @pytest.mark.parametrize("model_name", ["bert-base-german-cased", "google/electra-small-discriminator"]) def test_fast_tokenizer_with_examples(caplog, model_name): - fast_tokenizer = get_tokenizer(model_name, lower_case=False, use_fast=True) + fast_tokenizer = get_tokenizer(model_name, lower_case=False) tokenizer = get_tokenizer(model_name, lower_case=False, use_fast=False) for text in TEXTS: @@ -389,7 +389,7 @@ def test_fast_bert_custom_vocab(caplog): lang_model = "bert-base-cased" - tokenizer = get_tokenizer(pretrained_model_name_or_path=lang_model, do_lower_case=False, use_fast=True) + tokenizer = get_tokenizer(pretrained_model_name_or_path=lang_model, do_lower_case=False) # deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt") tokenizer.add_tokens(new_tokens=["neverseentokens"]) @@ -458,7 +458,7 @@ def test_fast_bert_custom_vocab(caplog): def test_fast_tokenizer_type(caplog, model_name, tokenizer_type): caplog.set_level(logging.CRITICAL) - tokenizer = get_tokenizer(model_name, use_fast=True) + tokenizer = get_tokenizer(model_name) assert type(tokenizer) is tokenizer_type @@ -477,13 +477,13 @@ def test_fast_tokenizer_type(caplog, model_name, tokenizer_type): def test_fast_electra_tokenizer(caplog): caplog.set_level(logging.CRITICAL) - tokenizer = get_tokenizer("dbmdz/electra-base-german-europeana-cased-discriminator", use_fast=True) + tokenizer = get_tokenizer("dbmdz/electra-base-german-europeana-cased-discriminator") assert type(tokenizer) is ElectraTokenizerFast @pytest.mark.parametrize("model_name", ["bert-base-cased", "distilbert-base-uncased", "deepset/electra-base-squad2"]) def test_detokenization_in_fast_tokenizers(model_name): - tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, use_fast=True) + tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name) for text in TEXTS: encoded = tokenizer.encode_plus(text, add_special_tokens=False).encodings[0] From b6b4e1d117c93315729eb4c2e167ab8b3889c22c Mon Sep 17 00:00:00 2001 From: ZanSara Date: Tue, 21 Jun 2022 11:38:53 +0200 Subject: [PATCH 06/89] Fixing dpr issue in test --- haystack/modeling/model/adaptive_model.py | 2 +- haystack/modeling/model/biadaptive_model.py | 2 +- haystack/modeling/model/language_model.py | 424 ++++++++------------ test/modeling/test_modeling_dpr.py | 38 +- 4 files changed, 200 insertions(+), 266 deletions(-) diff --git a/haystack/modeling/model/adaptive_model.py b/haystack/modeling/model/adaptive_model.py index c80ac19083..4df4b94f97 100644 --- 
a/haystack/modeling/model/adaptive_model.py +++ b/haystack/modeling/model/adaptive_model.py @@ -328,7 +328,7 @@ def convert_from_transformers( :return: AdaptiveModel """ - lm = get_language_model(model_name_or_path, revision=revision, auth_token=use_auth_token, **kwargs) + lm = get_language_model(model_name_or_path, revision=revision, auth_token=use_auth_token, model_kwargs=kwargs) if task_type is None: # Infer task type from config architecture = lm.model.config.architectures[0] diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 3c7625e3ff..c2a967978a 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -319,7 +319,7 @@ def forward_lm(self, **kwargs): pooled_output[0] = pooled_output1 if "passage_input_ids" in kwargs.keys(): - passage_params = {key.replace("passage_", ""): value for key, value in kwargs.items() if key.startswith("passage_")} + passage_params = {key.replace("passage_", ""): value[0] for key, value in kwargs.items() if key.startswith("passage_")} pooled_output2, _ = self.language_model2(**passage_params) pooled_output[1] = pooled_output2 diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 5bbe664b48..4c475a979e 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -17,7 +17,12 @@ Acknowledgements: Many of the modeling parts here come from the great transformers repository: https://github.com/huggingface/transformers. Thanks for the great work! """ + from typing import Type, Optional, Dict, Any, Union, List +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal # type: ignore import re import json @@ -94,28 +99,35 @@ def __init__(self, name: str): self._output_dims = None self.name = name + @property + def encoder(self): + return self.model.encoder + @abstractmethod def forward( self, input_ids: torch.Tensor, segment_ids: torch.Tensor, - padding_mask: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None ): raise NotImplementedError - def enable_hidden_states_output(self): + @property + def output_hidden_states(self): """ - Sets the model to output the hidden states + Controls whether the model outputs the hidden states or not """ - self.model.encoder.config.output_hidden_states = True + self.encoder.config.output_hidden_states = True - def disable_hidden_states_output(self): + @output_hidden_states.setter + def output_hidden_states(self, value: bool): """ - Sets the model to not output the hidden states + Sets the model to output the hidden states or not """ - self.model.encoder.config.output_hidden_states = False + self.encoder.config.output_hidden_states = value @property def output_dims(self): @@ -144,12 +156,7 @@ def save_config(self, save_dir: Union[Path, str]): setattr(self.model.config, "name", self.name) setattr(self.model.config, "language", self.language) - # For DPR models, transformers overwrites the model_type with the one set in DPRConfig - # Therefore, we copy the model_type from the model config to DPRConfig - if self.name == "DPRQuestionEncoder" or self.name == "DPRContextEncoder": - setattr(transformers.DPRConfig, "model_type", self.model.config.model_type) string = self.model.config.to_json_string() - with open(save_filename, "w") as file: file.write(string) 
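The enable/disable helpers are replaced here by an output_hidden_states property that forwards to the wrapped encoder's config. A standalone sketch of the intended read/write behaviour; class names are placeholders, and in this sketch the getter simply returns the flag:

class _Config:
    def __init__(self):
        self.output_hidden_states = False

class _Encoder:
    def __init__(self):
        self.config = _Config()

class _LanguageModelSketch:
    def __init__(self):
        self.encoder = _Encoder()

    @property
    def output_hidden_states(self) -> bool:
        # Read the flag from the wrapped encoder's config.
        return self.encoder.config.output_hidden_states

    @output_hidden_states.setter
    def output_hidden_states(self, value: bool):
        # Write the flag back to the wrapped encoder's config.
        self.encoder.config.output_hidden_states = value

lm = _LanguageModelSketch()
lm.output_hidden_states = True
assert lm.output_hidden_states is True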
@@ -268,7 +275,7 @@ def __init__( language: str = None, n_added_tokens: int = 0, auth_token: Optional[str] = None, - **kwargs + transformers_args: Optional[Dict[str, Any]] = None ): """ Load a pretrained model by supplying one of the following: @@ -295,12 +302,12 @@ def __init__( # Haystack style haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" model_config = config_class.from_pretrained(haystack_lm_config) - self.model = model_class.from_pretrained(haystack_lm_model, config=model_config, use_auth_token=auth_token or False, **kwargs) + self.model = model_class.from_pretrained(haystack_lm_model, config=model_config, use_auth_token=auth_token or False, **(transformers_args or {})) self.language = self.model.config.language else: # Pytorch-transformer Style - self.model = model_class.from_pretrained(str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **kwargs) - self.language = language or _infer_language_from_name(pretrained_model_name_or_path) + self.model = model_class.from_pretrained(str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **(transformers_args or {})) + self.language = language or _guess_language(pretrained_model_name_or_path) # resize embeddings in case of custom vocab if n_added_tokens != 0: @@ -319,7 +326,8 @@ def forward( self, input_ids: torch.Tensor, segment_ids: torch.Tensor, - padding_mask: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None ): @@ -330,26 +338,25 @@ def forward( :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the first sentence are marked with 0 and the tokens in the second sentence are marked with 1. It is a tensor of shape [batch_size, max_seq_len]. - :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens - of shape [batch_size, max_seq_len]. + :param padding_mask/attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens + of shape [batch_size, max_seq_len]. Different models call this parameter differently (padding/attention mask). :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. :return: Embeddings for each token in the input sequence. Can also return hidden states and attentions if specified using the arguments `output_hidden_states` and `output_attentions`. 
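The constructor above distinguishes two on-disk layouts: a Haystack-format directory versus a plain transformers checkpoint or hub name. A hedged sketch of that detection step, with placeholder paths:

from pathlib import Path

def loading_style(pretrained_model_name_or_path: str) -> str:
    # A Haystack-format checkpoint directory carries its own config file.
    haystack_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
    if haystack_config.exists():
        return "haystack"      # load language_model.bin with the stored config
    return "transformers"      # defer to <ModelClass>.from_pretrained on the name

print(loading_style("bert-base-uncased"))        # "transformers"
print(loading_style("some_dir/haystack_model"))  # "haystack" only if the config file exists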
""" - if output_hidden_states is None: - output_hidden_states = self.model.encoder.config.output_hidden_states - if output_attentions is None: - output_attentions = self.model.encoder.config.output_attentions - - output_tuple = self.model( + mask = {} + if padding_mask is not None: + mask["padding_mask"] = padding_mask + else: + mask["attention_mask"] = attention_mask + return self.model( input_ids, token_type_ids=segment_ids, - attention_mask=padding_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions, + output_hidden_states=output_hidden_states or self.encoder.config.output_hidden_states, + output_attentions=output_attentions or self.encoder.config.output_attentions, return_dict=False, + **mask ) - return output_tuple class HFLanguageModelWithPooler(HFLanguageModel): @@ -362,7 +369,7 @@ class HFLanguageModelWithPooler(HFLanguageModel): - Unlike the other BERT variants, these don't output the `pooled_output`. An additional pooler is initialized. """ - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, **kwargs): + def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, transformers_args: Optional[Dict[str, Any]] = None): """ Load a pretrained model by supplying one of the following: @@ -391,7 +398,8 @@ def forward( # type: ignore self, input_ids: torch.Tensor, segment_ids: torch.Tensor, - padding_mask: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, **kwargs, @@ -400,16 +408,18 @@ def forward( # type: ignore Perform the forward pass of the model. :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. - :param padding_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens - of shape [batch_size, max_seq_len]. + :param padding_mask/attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens + of shape [batch_size, max_seq_len]. Different models call this parameter differently (padding/attention mask). :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. :return: Embeddings for each token in the input sequence. """ + output_tuple = super().forward( input_ids=input_ids, segment_ids=segment_ids, padding_mask=padding_mask, + attention_mask=attention_mask, output_hidden_states=output_hidden_states, output_attentions=output_attentions, **kwargs, @@ -418,29 +428,33 @@ def forward( # type: ignore return (output_tuple[0], pooled_output) + output_tuple[1:] -class DPRQuestionEncoder(LanguageModel): +class DPREncoder(LanguageModel): """ - A DPRQuestionEncoder model that wraps Hugging Face's implementation. + A DPREncoder model that wraps Hugging Face's implementation. """ - @silence_transformers_logs def __init__( self, pretrained_model_name_or_path: Union[Path, str], + model_type: str, language: str = None, auth_token: Optional[str] = None, - **kwargs, + transformers_kwargs: Optional[Dict[str, Any]] = None ): """ Load a pretrained model by supplying one of the following: - * The name of a remote model on s3 (for example, "facebook/dpr-question_encoder-single-nq-base"). 
* A local path of a model trained using transformers (for example, "some_dir/huggingface_model"). * A local path of a model trained using Haystack (for example, "some_dir/haystack_model"). - :param pretrained_model_name_or_path: The path of the base pretrained language model whose weights are used to initialize DPRQuestionEncoder. """ - super().__init__(name="DPRQuestionEncoder") + super().__init__(name=model_type) + self.role = "question" if "question" in model_type.lower() else "context" + self._encoder = None + + kwargs = transformers_kwargs or {} + model_classname = f"DPR{self.role.capitalize()}Encoder" + model_class: Type[PreTrainedModel] = getattr(transformers, model_classname, None) # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -451,30 +465,44 @@ def __init__( if original_model_config.model_type == "dpr": dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) - self.model = transformers.DPRQuestionEncoder.from_pretrained( + self.model = model_class.from_pretrained( haystack_lm_model, config=dpr_config, **kwargs ) else: - if original_model_config.model_type.lower() != "bert": + if original_model_config.model_type != "bert": logger.warning( f"Using a model of type '{original_model_config.model_type}' which might be incompatible with DPR encoders." f"Bert based encoders are supported that need input_ids,token_type_ids,attention_mask as input tensors." ) original_config_dict = vars(original_model_config) original_config_dict.update(kwargs) - self.model = transformers.DPRQuestionEncoder( + self.model = model_class( config=transformers.DPRConfig(**original_config_dict) ) - self.model.base_model.bert_model = get_language_model(str(pretrained_model_name_or_path), auth_token=auth_token).model - + + language_model_type = _get_model_type(haystack_lm_config, auth_token=auth_token, **kwargs) + # Find the class corresponding to this model type + language_model_class: Type[LanguageModel] = HUGGINGFACE_TO_HAYSTACK.get(language_model_type, None) + if not language_model_class: + raise ValueError( + f"The type of model supplied ({language_model_type}) is not supported by Haystack. 
" + f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}") + + # Instantiate the class for this model + self.model.base_model.bert_model = language_model_class( + pretrained_model_name_or_path, + model_type=language_model_type, + **kwargs + ).model + self.language = self.model.config.language else: original_model_config = AutoConfig.from_pretrained( pretrained_model_name_or_path, use_auth_token=auth_token or False ) - if "dpr" in original_model_config.model_type.lower(): + if original_model_config.model_type == "dpr": # "pretrained dpr model": load existing pretrained DPRQuestionEncoder model - self.model = transformers.DPRQuestionEncoder.from_pretrained( + self.model = model_class.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **kwargs ) else: @@ -488,155 +516,29 @@ def __init__( ) original_config_dict = vars(original_model_config) original_config_dict.update(kwargs) - self.model = transformers.DPRQuestionEncoder( + self.model = model_class( config=transformers.DPRConfig(**original_config_dict) ) self.model.base_model.bert_model = AutoModel.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **original_config_dict ) - self.language = language or _infer_language_from_name(pretrained_model_name_or_path) - - - def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] = None): - """ - Save the model `state_dict` and its configuration file so that it can be loaded again. - - :param save_dir: The directory in which the model should be saved. - :param state_dict: A dictionary containing the whole state of the module including names of layers. - By default, the unchanged state dictionary of the module is used. - """ - model_to_save = self.model.module if hasattr(self.model, "module") else self.model # Only save the model itself - - if "dpr" not in self.model.config.model_type.lower() and model_to_save.base_model_prefix.startswith("question_"): - state_dict = model_to_save.state_dict() - if state_dict: - keys = state_dict.keys() - for key in list(keys): - new_key = key - if key.startswith("question_encoder.bert_model.model."): - new_key = key.split("_encoder.bert_model.model.", 1)[1] - elif key.startswith("question_encoder.bert_model."): - new_key = key.split("_encoder.bert_model.", 1)[1] - state_dict[new_key] = state_dict.pop(key) - - super().save(save_dir=save_dir, state_dict=state_dict) - - def forward( # type: ignore - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, - attention_mask: torch.Tensor, - ): - """ - Perform the forward pass of the DPRQuestionEncoder model. - - :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. - :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the - first sentence are marked with 0 and the tokens in the second sentence are marked with 1. - It is a tensor of shape [batch_size, max_seq_len]. - :param attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens - of shape [batch_size, max_seq_len]. - :return: Embeddings for each token in the input sequence. 
- """ - output_tuple = self.model( - input_ids=input_ids, - token_type_ids=segment_ids, - attention_mask=attention_mask, - return_dict=True, - ) - if self.model.question_encoder.config.output_hidden_states == True: - pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states - return pooled_output, all_hidden_states - else: - pooled_output = output_tuple.pooler_output - return pooled_output, None - - def enable_hidden_states_output(self): - self.model.question_encoder.config.output_hidden_states = True - - def disable_hidden_states_output(self): - self.model.question_encoder.config.output_hidden_states = False + self.language = language or _guess_language(pretrained_model_name_or_path) + @property + def encoder(self): + if not self._encoder: + self._encoder = self.model.question_encoder if self.role == "question" else self.model.ctx_encoder + return self._encoder -class DPRContextEncoder(LanguageModel): - """ - A DPRContextEncoder model that wraps Hugging Face's implementation. - """ - @silence_transformers_logs - def __init__( - self, - pretrained_model_name_or_path: Union[Path, str], - language: str = None, - auth_token: Optional[str] = None, - **kwargs, - ): + def save_config(self, save_dir: Union[Path, str]): """ - Load a pretrained model by supplying one of the following: - - * The name of a remote model on s3 (for example, "facebook/dpr-ctx_encoder-single-nq-base"). - * A local path of a model trained using transformers (for example, "some_dir/huggingface_model"). - * A local path of a model trained using Haystack (for example, "some_dir/haystack_model"). - - :param pretrained_model_name_or_path: The path of the base pretrained language model whose weights are used to initialize DPRContextEncoder. + Save the configuration of the language model in Haystack format. """ - super().__init__(name="DPRContextEncoder") - # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format - haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" - - if os.path.exists(haystack_lm_config): - # Haystack style - original_model_config = AutoConfig.from_pretrained(haystack_lm_config) - haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" - - if "dpr" in original_model_config.model_type.lower(): - dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) - self.model = transformers.DPRContextEncoder.from_pretrained( - haystack_lm_model, config=dpr_config, use_auth_token=auth_token or False, **kwargs - ) - else: - if original_model_config.model_type.lower() != "bert": - logger.warning( - f"Using a model of type '{original_model_config.model_type}' which might be incompatible with DPR encoders." - f"Bert based encoders are supported that need input_ids,token_type_ids,attention_mask as input tensors." 
- ) - original_config_dict = vars(original_model_config) - original_config_dict.update(kwargs) - self.model = transformers.DPRContextEncoder( - config=transformers.DPRConfig(**original_config_dict) - ) - self.model.base_model.bert_model = get_language_model(str(pretrained_model_name_or_path), auth_token=auth_token).model - self.language = self.model.config.language - - else: - # Pytorch-transformer Style - original_model_config = AutoConfig.from_pretrained( - pretrained_model_name_or_path, use_auth_token=auth_token or False - ) - if "dpr" in original_model_config.model_type.lower(): - # "pretrained dpr model": load existing pretrained DPRContextEncoder model - self.model = transformers.DPRContextEncoder.from_pretrained( - str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **kwargs - ) - else: - # "from scratch": load weights from different architecture (e.g. bert) into DPRContextEncoder - # but keep config values from original architecture - # TODO test for architectures other than BERT, e.g. Electra - if original_model_config.model_type.lower() != "bert": - logger.warning( - f"Using a model of type '{original_model_config.model_type}' which might be incompatible with DPR encoders." - f"Bert based encoders are supported that need input_ids,token_type_ids,attention_mask as input tensors." - ) - original_config_dict = vars(original_model_config) - original_config_dict.update(kwargs) - self.model = transformers.DPRContextEncoder( - config=transformers.DPRConfig(**original_config_dict) - ) - self.model.base_model.bert_model = AutoModel.from_pretrained( - str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **original_config_dict - ) - self.language = language or _infer_language_from_name(pretrained_model_name_or_path) - - + # For DPR models, transformers overwrites the model_type with the one set in DPRConfig + # Therefore, we copy the model_type from the model config to DPRConfig + setattr(transformers.DPRConfig, "model_type", self.model.config.model_type) + super().save_config(save_dir=save_dir) + def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] = None): """ Save the model `state_dict` and its configuration file so that it can be loaded again. 
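The encoder property introduced above picks question_encoder or ctx_encoder depending on the role and caches the result. A minimal sketch with stand-in objects (names are illustrative):

class _FakeDPRModel:
    question_encoder = "question-encoder-module"
    ctx_encoder = "ctx-encoder-module"

class _DPREncoderSketch:
    def __init__(self, model, role: str):
        self.model = model
        self.role = role        # "question" or "context"
        self._encoder = None

    @property
    def encoder(self):
        # Resolve once, then reuse the cached sub-module.
        if self._encoder is None:
            self._encoder = self.model.question_encoder if self.role == "question" else self.model.ctx_encoder
        return self._encoder

assert _DPREncoderSketch(_FakeDPRModel(), "question").encoder == "question-encoder-module"
assert _DPREncoderSketch(_FakeDPRModel(), "context").encoder == "ctx-encoder-module"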
@@ -646,19 +548,32 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] """ model_to_save = ( self.model.module if hasattr(self.model, "module") else self.model - ) # Only save the model it-self - - if "dpr" not in self.model.config.model_type.lower() and model_to_save.base_model_prefix.startswith("ctx_"): - state_dict = model_to_save.state_dict() - if state_dict: - keys = state_dict.keys() - for key in list(keys): - new_key = key - if key.startswith("ctx_encoder.bert_model.model."): - new_key = key.split("_encoder.bert_model.model.", 1)[1] - elif key.startswith("ctx_encoder.bert_model."): - new_key = key.split("_encoder.bert_model.", 1)[1] - state_dict[new_key] = state_dict.pop(key) + ) # Only save the model itself + + if "dpr" not in self.model.config.model_type.lower(): + if model_to_save.base_model_prefix.startswith("ctx_"): + state_dict = model_to_save.state_dict() + if state_dict: + keys = state_dict.keys() + for key in list(keys): + new_key = key + if key.startswith("ctx_encoder.bert_model.model."): + new_key = key.split("_encoder.bert_model.model.", 1)[1] + elif key.startswith("ctx_encoder.bert_model."): + new_key = key.split("_encoder.bert_model.", 1)[1] + state_dict[new_key] = state_dict.pop(key) + + elif model_to_save.base_model_prefix.startswith("question_"): + state_dict = model_to_save.state_dict() + if state_dict: + keys = state_dict.keys() + for key in list(keys): + new_key = key + if key.startswith("question_encoder.bert_model.model."): + new_key = key.split("_encoder.bert_model.model.", 1)[1] + elif key.startswith("question_encoder.bert_model."): + new_key = key.split("_encoder.bert_model.", 1)[1] + state_dict[new_key] = state_dict.pop(key) super().save(save_dir=save_dir, state_dict=state_dict) @@ -666,42 +581,38 @@ def forward( # type: ignore self, input_ids: torch.Tensor, segment_ids: torch.Tensor, - attention_mask: torch.Tensor, + attention_mask: torch.Tensor ): """ - Perform the forward pass of the DPRContextEncoder model. + Perform the forward pass of the DPR encoder model. - :param passage_input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, number_of_hard_negative_passages, max_seq_len]. - :param passage_segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the + :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, number_of_hard_negative, max_seq_len]. + :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the first sentence are marked with 0 and the tokens in the second sentence are marked with 1. It is a tensor of shape [batch_size, number_of_hard_negative_passages, max_seq_len]. - :param passage_attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens + :param attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens of shape [batch_size, number_of_hard_negative_passages, max_seq_len]. :return: Embeddings for each token in the input sequence. 
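The save() logic above rewrites state_dict keys so that weights wrapped by a DPR encoder can be reloaded as a plain language model. The renaming rule, as a standalone sketch over a toy state dict:

def strip_dpr_prefix(state_dict: dict) -> dict:
    # Keys like "ctx_encoder.bert_model.model.x" or "question_encoder.bert_model.x"
    # are reduced to the part after the wrapper prefix.
    renamed = {}
    for key, value in state_dict.items():
        new_key = key
        if "_encoder.bert_model.model." in key:
            new_key = key.split("_encoder.bert_model.model.", 1)[1]
        elif "_encoder.bert_model." in key:
            new_key = key.split("_encoder.bert_model.", 1)[1]
        renamed[new_key] = value
    return renamed

sd = {
    "ctx_encoder.bert_model.model.embeddings.weight": 1,
    "question_encoder.bert_model.pooler.dense.bias": 2,
}
assert strip_dpr_prefix(sd) == {"embeddings.weight": 1, "pooler.dense.bias": 2}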
""" - max_seq_len = input_ids.shape[-1] - input_ids = input_ids.view(-1, max_seq_len) - segment_ids = segment_ids.view(-1, max_seq_len) - attention_mask = attention_mask.view(-1, max_seq_len) + if not self.role == "question": + max_seq_len = input_ids.shape[-1] + input_ids = input_ids.view(-1, max_seq_len) + segment_ids = segment_ids.view(-1, max_seq_len) + attention_mask = attention_mask.view(-1, max_seq_len) + output_tuple = self.model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True, ) - if self.model.ctx_encoder.config.output_hidden_states == True: + if self.encoder.config.output_hidden_states == True: pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states return pooled_output, all_hidden_states else: pooled_output = output_tuple.pooler_output return pooled_output, None - def enable_hidden_states_output(self): - self.model.ctx_encoder.config.output_hidden_states = True - - def disable_hidden_states_output(self): - self.model.ctx_encoder.config.output_hidden_states = False - HUGGINGFACE_TO_HAYSTACK = { "Albert": HFLanguageModel, @@ -712,8 +623,8 @@ def disable_hidden_states_output(self): "Data2VecVision": HFLanguageModel, "DebertaV2": HFLanguageModelWithPooler, "DistilBert": HFLanguageModelWithPooler, - "DPRContextEncoder": DPRContextEncoder, - "DPRQuestionEncoder": DPRQuestionEncoder, + "DPRContextEncoder": DPREncoder, + "DPRQuestionEncoder": DPREncoder, "Electra": HFLanguageModelWithPooler, "GloVe": HFLanguageModel, "MiniLM": HFLanguageModel, @@ -730,6 +641,10 @@ def disable_hidden_states_output(self): "roberta.*xml": "XLMRoberta", "codebert.*mlm": "Roberta", "mlm.*codebert": "Roberta", + "dpr.*question.*encoder": "DPRQuestionEncoder", + "dpr.*context.*encoder": "DPRContextEncoder", + "dpr.*ctx.*encoder": "DPRContextEncoder", + "mlm.*codebert": "Roberta", "deberta-v2": "DebertaV2", "data2vec-vision": "Data2VecVision", } @@ -755,7 +670,8 @@ def get_language_model( language_model_type: Optional[str] = None, auth_token: Optional[str] = None, revision: Optional[str] = None, - **kwargs + autoconfig_kwargs: Optional[Dict[str, Any]] = None, + model_kwargs: Optional[Dict[str, Any]] = None ) -> LanguageModel: """ Load a pretrained language model by doing one of the following: @@ -772,7 +688,7 @@ def get_language_model( :param revision: The version of the model to use from the Hugging Face model hub. This can be a tag name, a branch name, or a commit hash. :param language_model_type: (Optional) Name of the language model class to load (for example `Bert`). Overrides any other discovered value. 
""" - logger.info(f"Loading model '{pretrained_model_name_or_path}'") + logger.info(f" * LOADING MODEL: '{pretrained_model_name_or_path}'") config_file = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -789,7 +705,7 @@ def get_language_model( logger.info(f"Could not find '{pretrained_model_name_or_path}' locally.") logger.info(f"Looking on Transformers Model Hub (in local cache and online)...") language_model_type = _get_model_type( - pretrained_model_name_or_path, auth_token=auth_token, revision=revision, **kwargs + pretrained_model_name_or_path, auth_token=auth_token, revision=revision, autoconfig_kwargs=autoconfig_kwargs ) if not language_model_type: raise Exception( @@ -811,56 +727,68 @@ def get_language_model( pretrained_model_name_or_path, model_type=language_model_type, auth_token=auth_token, - **kwargs + transformers_args=model_kwargs ) logger.info(f"Loaded '{pretrained_model_name_or_path}' ({language_model_type} model)") return language_model -def _get_model_type(model_name_or_path: Union[str, Path], auth_token: Optional[str] = None, revision: Optional[str] = None, **kwargs) -> str: +def _get_model_type( + model_name_or_path: Union[str, Path], + auth_token: Optional[str] = None, + revision: Optional[str] = None, + autoconfig_kwargs: Optional[Dict[str, Any]] = None +) -> str: """ Given a model name, try to use AutoConfig to understand which model type it is. In case it's not successful, tries to infer the type from the name of the model. """ - # Use AutoConfig to understand the model class model_name_or_path = str(model_name_or_path) - config = AutoConfig.from_pretrained( - pretrained_model_name_or_path=model_name_or_path, - use_auth_token=auth_token or False, - revision=revision, - **kwargs - ) - # Find if this mode is present in MODEL_TYPE_BY_NAME.keys() even with a different capitalization - model_type = {key.lower(): key for key in HUGGINGFACE_TO_HAYSTACK.keys()}.get(config.model_type.lower(), None) + if autoconfig_kwargs and "use_auth_token" in autoconfig_kwargs: + auth_token = autoconfig_kwargs["use_auth_token"] + del autoconfig_kwargs["use_auth_token"] - if not model_type: - # DPR - if "dpr" in config.model_type: - if config.archictectures[0] == "DPRReader": - raise NotImplementedError("DPRReader models are currently not supported.") - model_type = config.architectures[0] + model_type: Optional[Type[LanguageModel]] = None + # Use AutoConfig to understand the model class + try: + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path=model_name_or_path, + use_auth_token=auth_token or False, + revision=revision, + **(autoconfig_kwargs or {}) + ) + # Find if this mode is present in MODEL_TYPE_BY_NAME.keys() even with a different capitalization + model_type = {key.lower(): key for key in HUGGINGFACE_TO_HAYSTACK.keys()}.get(config.model_type.lower(), None) - else: - logger.warning("Could not infer the class from config. Trying to infer class from model name.") + except Exception as e: + logger.exception( + f"AutoConfig failed to load on '{model_name_or_path}'. " + ) - # Look for other patterns and variation that hints at the model type - for regex, model_name in NAME_HINTS.keys(): - if re.match(regex, model_name_or_path): - model_type = model_name - break + if not model_type: + logger.warning("Could not infer the model type from its config. 
Looking for clues in the model name.") + + # Look for other patterns and variation that hints at the model type + for regex, model_name in NAME_HINTS.items(): + if re.match(f".*{regex}.*", model_name_or_path): + model_type = model_name + break - if model_type == "Roberta" and "mlm" in model_name_or_path.lower(): + if model_type and model_type.lower() == "roberta" and "mlm" in model_name_or_path.lower(): logging.error(f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later.") return model_type -def _infer_language_from_name(name: str) -> str: +def _guess_language(name: str) -> str: + """ + Looks for clues about the model language in the model name. + """ languages = [lang for hint, lang in LANGUAGE_HINTS if hint.lower() in name.lower()] if len(languages) > 0: language = languages[0] else: language = "english" - logger.info(f"Automatically detected language from model name: {language}") + logger.info(f"Auto-detected model language: {language}") return language \ No newline at end of file diff --git a/test/modeling/test_modeling_dpr.py b/test/modeling/test_modeling_dpr.py index d8090f596a..ab8aa96db2 100644 --- a/test/modeling/test_modeling_dpr.py +++ b/test/modeling/test_modeling_dpr.py @@ -11,7 +11,7 @@ from haystack.modeling.data_handler.dataloader import NamedDataLoader from haystack.modeling.data_handler.processor import TextSimilarityProcessor from haystack.modeling.model.biadaptive_model import BiAdaptiveModel -from haystack.modeling.model.language_model import get_language_model, DPRContextEncoder, DPRQuestionEncoder +from haystack.modeling.model.language_model import get_language_model, DPREncoder from haystack.modeling.model.prediction_head import TextSimilarityHead from haystack.modeling.model.tokenization import get_tokenizer from haystack.modeling.utils import set_all_seeds, initialize_device_settings @@ -47,15 +47,21 @@ def test_dpr_modules(caplog=None): num_hard_negatives=1, ) - question_language_model = DPRQuestionEncoder( + question_language_model = DPREncoder( pretrained_model_name_or_path="bert-base-uncased", - hidden_dropout_prob=0, - attention_probs_dropout_prob=0, + model_type="DPRQuestionEncoder", + transformers_kwargs={ + "hidden_dropout_prob": 0, + "attention_probs_dropout_prob": 0, + } ) - passage_language_model = DPRContextEncoder( + passage_language_model = DPREncoder( pretrained_model_name_or_path="bert-base-uncased", - hidden_dropout_prob=0, - attention_probs_dropout_prob=0, + model_type="DPRContextEncoder", + transformers_kwargs={ + "hidden_dropout_prob": 0, + "attention_probs_dropout_prob": 0, + } ) prediction_head = TextSimilarityHead(similarity_function="dot_product") @@ -74,8 +80,8 @@ def test_dpr_modules(caplog=None): assert type(model) == BiAdaptiveModel assert type(processor) == TextSimilarityProcessor - assert type(question_language_model) == DPRQuestionEncoder - assert type(passage_language_model) == DPRContextEncoder + assert type(question_language_model) == DPREncoder + assert type(passage_language_model) == DPREncoder # check embedding layer weights assert list(model.named_parameters())[0][1][0, 0].item() - -0.010200000368058681 < 0.0001 @@ -695,11 +701,11 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ query_tokenizer = get_tokenizer( pretrained_model_name_or_path=query_embedding_model ) # tokenizer class is inferred automatically - query_encoder = DPRQuestionEncoder( + query_encoder = get_language_model( pretrained_model_name_or_path=query_embedding_model ) 
passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_embedding_model) - passage_encoder = DPRContextEncoder( + passage_encoder = get_language_model( pretrained_model_name_or_path=passage_embedding_model ) @@ -744,13 +750,13 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, use_fast=True ) # tokenizer class is inferred automatically loaded_query_encoder = get_language_model( - pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, language_model_class="DPRQuestionEncoder" + pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir ) loaded_passage_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, use_fast=True ) loaded_passage_encoder = get_language_model( - pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, language_model_class="DPRContextEncoder" + pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir ) loaded_processor = TextSimilarityProcessor( @@ -797,7 +803,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ all_embeddings = {"query": [], "passages": []} model.eval() - for i, batch in enumerate(tqdm(data_loader, desc=f"Creating Embeddings", unit=" Batches", disable=True)): + for batch in tqdm(data_loader, desc=f"Creating Embeddings", unit=" Batches", disable=True): batch = {key: batch[key].to(device) for key in batch} # get logits @@ -856,11 +862,11 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir ) # tokenizer class is inferred automatically query_encoder = get_language_model( - pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, language_model_class="DPRQuestionEncoder" + pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir ) passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir) passage_encoder = get_language_model( - pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, language_model_class="DPRContextEncoder" + pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir ) processor = TextSimilarityProcessor( From 268cacd516940255d4c45022f6459a7f53fd0b23 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 12:10:33 +0200 Subject: [PATCH 07/89] Fixing DPRetriever, Embedding Retriever and usage of new API in modeling --- haystack/document_stores/memory.py | 9 +- haystack/modeling/infer.py | 4 +- haystack/modeling/model/adaptive_model.py | 4 +- haystack/modeling/model/biadaptive_model.py | 47 ++++++++-- haystack/modeling/model/language_model.py | 85 +++++++++---------- haystack/modeling/model/tokenization.py | 12 ++- haystack/modeling/model/triadaptive_model.py | 6 +- haystack/nodes/retriever/dense.py | 56 ++++++------ .../{test_modeling_dpr.py => test_dpr.py} | 22 ++++- ...odeling_inference.py => test_inference.py} | 0 ...iction_head.py => test_prediction_head.py} | 0 ...odeling_processor.py => test_processor.py} | 0 ...loading.py => test_processor_save_load.py} | 0 ...nswering.py => test_question_answering.py} | 0 14 files changed, 140 insertions(+), 105 deletions(-) rename test/modeling/{test_modeling_dpr.py => test_dpr.py} (98%) rename test/modeling/{test_modeling_inference.py => test_inference.py} (100%) rename test/modeling/{test_modeling_prediction_head.py => test_prediction_head.py} (100%) rename 
test/modeling/{test_modeling_processor.py => test_processor.py} (100%) rename test/modeling/{test_modeling_processor_saving_loading.py => test_processor_save_load.py} (100%) rename test/modeling/{test_modeling_question_answering.py => test_question_answering.py} (100%) diff --git a/haystack/document_stores/memory.py b/haystack/document_stores/memory.py index c738e4e8ee..d32deaad40 100644 --- a/haystack/document_stores/memory.py +++ b/haystack/document_stores/memory.py @@ -10,7 +10,7 @@ from tqdm import tqdm from haystack.schema import Document, Label -from haystack.errors import DuplicateDocumentError +from haystack.errors import DuplicateDocumentError, DocumentStoreError from haystack.document_stores import BaseDocumentStore from haystack.document_stores.base import get_batches_from_generator from haystack.modeling.utils import initialize_device_settings @@ -448,8 +448,11 @@ def update_embeddings( ) as progress_bar: for document_batch in batched_documents: embeddings = retriever.embed_documents(document_batch) # type: ignore - assert len(document_batch) == len(embeddings) - + if not len(document_batch) == len(embeddings): + raise DocumentStoreError( + "The number of embeddings does not match the number of documents in the batch " + f"({len(embeddings)} != {len(document_batch)})" + ) if embeddings[0].shape[0] != self.embedding_dim: raise RuntimeError( f"Embedding dim. of model ({embeddings[0].shape[0]})" diff --git a/haystack/modeling/infer.py b/haystack/modeling/infer.py index 8245c81bef..85ab58cfe2 100644 --- a/haystack/modeling/infer.py +++ b/haystack/modeling/infer.py @@ -472,9 +472,7 @@ def _get_predictions(self, dataset: Dataset, tensor_names: List, baskets): preds = self.model.formatted_preds( logits=logits, samples=batch_samples, - tokenizer=self.processor.tokenizer, - return_class_probs=self.return_class_probs, - **batch, + padding_mask=batch.get("padding_mask", None) ) preds_all += preds return preds_all diff --git a/haystack/modeling/model/adaptive_model.py b/haystack/modeling/model/adaptive_model.py index 4df4b94f97..bd11b10981 100644 --- a/haystack/modeling/model/adaptive_model.py +++ b/haystack/modeling/model/adaptive_model.py @@ -328,7 +328,7 @@ def convert_from_transformers( :return: AdaptiveModel """ - lm = get_language_model(model_name_or_path, revision=revision, auth_token=use_auth_token, model_kwargs=kwargs) + lm = get_language_model(model_name_or_path, revision=revision, use_auth_token=use_auth_token, model_kwargs=kwargs) if task_type is None: # Infer task type from config architecture = lm.model.config.architectures[0] @@ -481,7 +481,7 @@ def forward(self, output_tuple = self.language_model.forward( input_ids=input_ids, segment_ids=segment_ids, - padding_mask=padding_mask, + attention_mask=padding_mask, output_hidden_states=output_hidden_states, output_attentions=output_attentions ) diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index c2a967978a..9718117350 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -258,7 +258,15 @@ def prepare_labels(self, **kwargs): all_labels.append(labels) return all_labels - def forward(self, **kwargs): + def forward( + self, + query_input_ids: Optional[torch.Tensor] = None, + query_segment_ids: Optional[torch.Tensor] = None, + query_attention_mask: Optional[torch.Tensor] = None, + passage_input_ids: Optional[torch.Tensor] = None, + passage_segment_ids: Optional[torch.Tensor] = None, + passage_attention_mask: 
Optional[torch.Tensor] = None + ): """ Push data through the whole model and returns logits. The data will propagate through the first language model and second language model based on the tensor names and both the @@ -269,7 +277,14 @@ def forward(self, **kwargs): """ # Run forward pass of both language models - pooled_output = self.forward_lm(**kwargs) + pooled_output = self.forward_lm( + query_input_ids=query_input_ids, + query_segment_ids=query_segment_ids, + query_attention_mask=query_attention_mask, + passage_input_ids=passage_input_ids, + passage_segment_ids=passage_segment_ids, + passage_attention_mask=passage_attention_mask + ) # Run forward pass of (multiple) prediction heads using the output from above all_logits = [] @@ -304,7 +319,15 @@ def forward(self, **kwargs): return all_logits - def forward_lm(self, **kwargs): + def forward_lm( + self, + query_input_ids: Optional[torch.Tensor] = None, + query_segment_ids: Optional[torch.Tensor] = None, + query_attention_mask: Optional[torch.Tensor] = None, + passage_input_ids: Optional[torch.Tensor] = None, + passage_segment_ids: Optional[torch.Tensor] = None, + passage_attention_mask: Optional[torch.Tensor] = None, + ): """ Forward pass for the BiAdaptive model. @@ -313,14 +336,20 @@ def forward_lm(self, **kwargs): """ pooled_output = [None, None] - if "query_input_ids" in kwargs.keys(): - query_params = {key.replace("query_", ""): value for key, value in kwargs.items() if key.startswith("query_")} - pooled_output1, _ = self.language_model1(**query_params) + if query_input_ids is not None: + pooled_output1, _ = self.language_model1( + input_ids=query_input_ids, + segment_ids=query_segment_ids, + attention_mask=query_attention_mask + ) pooled_output[0] = pooled_output1 - if "passage_input_ids" in kwargs.keys(): - passage_params = {key.replace("passage_", ""): value[0] for key, value in kwargs.items() if key.startswith("passage_")} - pooled_output2, _ = self.language_model2(**passage_params) + if passage_input_ids is not None: + pooled_output2, _ = self.language_model2( + input_ids=passage_input_ids, + segment_ids=passage_segment_ids, + attention_mask=passage_attention_mask + ) pooled_output[1] = pooled_output2 return tuple(pooled_output) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 4c475a979e..fe896c2dc4 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -19,10 +19,6 @@ """ from typing import Type, Optional, Dict, Any, Union, List -try: - from typing import Literal -except ImportError: - from typing_extensions import Literal # type: ignore import re import json @@ -274,8 +270,8 @@ def __init__( model_type: str, language: str = None, n_added_tokens: int = 0, - auth_token: Optional[str] = None, - transformers_args: Optional[Dict[str, Any]] = None + use_auth_token: Optional[Union[str, bool]] = None, + model_kwargs: Optional[Dict[str, Any]] = None ): """ Load a pretrained model by supplying one of the following: @@ -289,7 +285,7 @@ def __init__( :param pretrained_model_name_or_path: The path of the saved pretrained model or the name of the model. :param model_type: the HuggingFace class name prefix (for example 'Bert', 'Roberta', etc...) 
:param language: the model's language ('multilingual' is also accepted) - :param auth_token: the HF token, if necessary + :param use_auth_token: the HF token or False """ super().__init__(name=model_type) @@ -302,11 +298,11 @@ def __init__( # Haystack style haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" model_config = config_class.from_pretrained(haystack_lm_config) - self.model = model_class.from_pretrained(haystack_lm_model, config=model_config, use_auth_token=auth_token or False, **(transformers_args or {})) + self.model = model_class.from_pretrained(haystack_lm_model, config=model_config, use_auth_token=use_auth_token, **(model_kwargs or {})) self.language = self.model.config.language else: # Pytorch-transformer Style - self.model = model_class.from_pretrained(str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **(transformers_args or {})) + self.model = model_class.from_pretrained(str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **(model_kwargs or {})) self.language = language or _guess_language(pretrained_model_name_or_path) # resize embeddings in case of custom vocab @@ -326,8 +322,7 @@ def forward( self, input_ids: torch.Tensor, segment_ids: torch.Tensor, - padding_mask: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, + attention_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None ): @@ -338,24 +333,19 @@ def forward( :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the first sentence are marked with 0 and the tokens in the second sentence are marked with 1. It is a tensor of shape [batch_size, max_seq_len]. - :param padding_mask/attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens + :param attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens of shape [batch_size, max_seq_len]. Different models call this parameter differently (padding/attention mask). :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. :return: Embeddings for each token in the input sequence. Can also return hidden states and attentions if specified using the arguments `output_hidden_states` and `output_attentions`. """ - mask = {} - if padding_mask is not None: - mask["padding_mask"] = padding_mask - else: - mask["attention_mask"] = attention_mask return self.model( input_ids, token_type_ids=segment_ids, + attention_mask=attention_mask, output_hidden_states=output_hidden_states or self.encoder.config.output_hidden_states, output_attentions=output_attentions or self.encoder.config.output_attentions, return_dict=False, - **mask ) @@ -369,7 +359,13 @@ class HFLanguageModelWithPooler(HFLanguageModel): - Unlike the other BERT variants, these don't output the `pooled_output`. An additional pooler is initialized. 
""" - def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: str = None, n_added_tokens: int = 0, transformers_args: Optional[Dict[str, Any]] = None): + def __init__( + self, + pretrained_model_name_or_path: Union[Path, str], + language: str = None, + n_added_tokens: int = 0, + model_kwargs: Optional[Dict[str, Any]] = None + ): """ Load a pretrained model by supplying one of the following: @@ -379,7 +375,7 @@ def __init__(self, pretrained_model_name_or_path: Union[Path, str], language: st :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. """ - super().__init__(pretrained_model_name_or_path, language, n_added_tokens, **kwargs) + super().__init__(pretrained_model_name_or_path, language, n_added_tokens, model_kwargs) self.pooler = None config = self.model.config @@ -398,11 +394,9 @@ def forward( # type: ignore self, input_ids: torch.Tensor, segment_ids: torch.Tensor, - padding_mask: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, + attention_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None, - **kwargs, + output_attentions: Optional[bool] = None ): """ Perform the forward pass of the model. @@ -418,11 +412,9 @@ def forward( # type: ignore output_tuple = super().forward( input_ids=input_ids, segment_ids=segment_ids, - padding_mask=padding_mask, attention_mask=attention_mask, output_hidden_states=output_hidden_states, output_attentions=output_attentions, - **kwargs, ) pooled_output = self.pooler(output_tuple[0]) return (output_tuple[0], pooled_output) + output_tuple[1:] @@ -438,8 +430,8 @@ def __init__( pretrained_model_name_or_path: Union[Path, str], model_type: str, language: str = None, - auth_token: Optional[str] = None, - transformers_kwargs: Optional[Dict[str, Any]] = None + use_auth_token: Optional[Union[str, bool]] = None, + model_kwargs: Optional[Dict[str, Any]] = None ): """ Load a pretrained model by supplying one of the following: @@ -452,7 +444,7 @@ def __init__( self.role = "question" if "question" in model_type.lower() else "context" self._encoder = None - kwargs = transformers_kwargs or {} + kwargs = model_kwargs or {} model_classname = f"DPR{self.role.capitalize()}Encoder" model_class: Type[PreTrainedModel] = getattr(transformers, model_classname, None) @@ -480,7 +472,7 @@ def __init__( config=transformers.DPRConfig(**original_config_dict) ) - language_model_type = _get_model_type(haystack_lm_config, auth_token=auth_token, **kwargs) + language_model_type = _get_model_type(haystack_lm_config, use_auth_token=use_auth_token, **kwargs) # Find the class corresponding to this model type language_model_class: Type[LanguageModel] = HUGGINGFACE_TO_HAYSTACK.get(language_model_type, None) if not language_model_class: @@ -498,12 +490,12 @@ def __init__( self.language = self.model.config.language else: original_model_config = AutoConfig.from_pretrained( - pretrained_model_name_or_path, use_auth_token=auth_token or False + pretrained_model_name_or_path, use_auth_token=use_auth_token ) if original_model_config.model_type == "dpr": # "pretrained dpr model": load existing pretrained DPRQuestionEncoder model self.model = model_class.from_pretrained( - str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **kwargs + str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **kwargs ) else: # "from scratch": load weights from different architecture (e.g. 
bert) into DPRQuestionEncoder @@ -520,7 +512,7 @@ def __init__( config=transformers.DPRConfig(**original_config_dict) ) self.model.base_model.bert_model = AutoModel.from_pretrained( - str(pretrained_model_name_or_path), use_auth_token=auth_token or False, **original_config_dict + str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **original_config_dict ) self.language = language or _guess_language(pretrained_model_name_or_path) @@ -615,6 +607,7 @@ def forward( # type: ignore HUGGINGFACE_TO_HAYSTACK = { + "Auto": HFLanguageModel, "Albert": HFLanguageModel, "Bert": HFLanguageModel, "BigBird": HFLanguageModel, @@ -668,7 +661,7 @@ def forward( # type: ignore def get_language_model( pretrained_model_name_or_path: Union[Path, str], language_model_type: Optional[str] = None, - auth_token: Optional[str] = None, + use_auth_token: Optional[Union[str, bool]] = None, revision: Optional[str] = None, autoconfig_kwargs: Optional[Dict[str, Any]] = None, model_kwargs: Optional[Dict[str, Any]] = None @@ -705,10 +698,10 @@ def get_language_model( logger.info(f"Could not find '{pretrained_model_name_or_path}' locally.") logger.info(f"Looking on Transformers Model Hub (in local cache and online)...") language_model_type = _get_model_type( - pretrained_model_name_or_path, auth_token=auth_token, revision=revision, autoconfig_kwargs=autoconfig_kwargs + pretrained_model_name_or_path, use_auth_token=use_auth_token, revision=revision, autoconfig_kwargs=autoconfig_kwargs ) if not language_model_type: - raise Exception( + raise ModelingError( f"Model not found for '{pretrained_model_name_or_path}'. Either supply the local path for a saved " f"model or one of bert/roberta/xlnet/albert/distilbert models that can be downloaded from remote. " f"Ensure that the model class name can be inferred from the directory name when loading a " @@ -726,8 +719,8 @@ def get_language_model( language_model = language_model_class( pretrained_model_name_or_path, model_type=language_model_type, - auth_token=auth_token, - transformers_args=model_kwargs + use_auth_token=use_auth_token, + model_kwargs=model_kwargs ) logger.info(f"Loaded '{pretrained_model_name_or_path}' ({language_model_type} model)") return language_model @@ -735,7 +728,7 @@ def get_language_model( def _get_model_type( model_name_or_path: Union[str, Path], - auth_token: Optional[str] = None, + use_auth_token: Optional[Union[str, bool]] = None, revision: Optional[str] = None, autoconfig_kwargs: Optional[Dict[str, Any]] = None ) -> str: @@ -745,16 +738,12 @@ def _get_model_type( """ model_name_or_path = str(model_name_or_path) - if autoconfig_kwargs and "use_auth_token" in autoconfig_kwargs: - auth_token = autoconfig_kwargs["use_auth_token"] - del autoconfig_kwargs["use_auth_token"] - - model_type: Optional[Type[LanguageModel]] = None + model_type: Optional[str] = None # Use AutoConfig to understand the model class try: config = AutoConfig.from_pretrained( pretrained_model_name_or_path=model_name_or_path, - use_auth_token=auth_token or False, + use_auth_token=use_auth_token, revision=revision, **(autoconfig_kwargs or {}) ) @@ -776,7 +765,11 @@ def _get_model_type( break if model_type and model_type.lower() == "roberta" and "mlm" in model_name_or_path.lower(): - logging.error(f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later.") + logger.error(f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later.") + + if not model_type: + logger.error("Model type not found. 
Using the AutoModel class. This can cause crashes later!") + model_type = "Auto" return model_type diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index 6e03e01351..b7eb2f0c71 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, Any, Tuple, Optional, List +from typing import Dict, Any, Union, Tuple, Optional, List import re import logging @@ -35,7 +35,7 @@ def get_tokenizer( pretrained_model_name_or_path: str, revision: str = None, use_fast: bool = True, - auth_token: Optional[str] = None, + use_auth_token: Optional[Union[str, bool]] = None, **kwargs, ) -> PreTrainedTokenizer: """ @@ -45,19 +45,16 @@ def get_tokenizer( :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`) :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. :param use_fast: Indicate if Haystack should try to load the fast version of the tokenizer (True) or use the Python one (False). Defaults to True. - :param auth_token: The auth_token to use in `PretrainedTokenizer.from_pretrained()`, if required + :param use_auth_token: The auth_token to use in `PretrainedTokenizer.from_pretrained()`, or False :param kwargs: other kwargs to pass on to `PretrainedTokenizer.from_pretrained()` :return: AutoTokenizer instance """ model_name_or_path = str(pretrained_model_name_or_path) - params = {} - - if auth_token: - params["use_auth_token"] = auth_token if "mlm" in model_name_or_path.lower(): raise NotImplementedError("MLM part of codebert is currently not supported in Haystack") + params = {} if any(tokenizer_type in model_name_or_path for tokenizer_type in ["albert", "xlnet"]): params["keep_accents"] = True @@ -65,6 +62,7 @@ def get_tokenizer( model_name_or_path, revision=revision, use_fast=use_fast, + use_auth_token=use_auth_token, **params, **kwargs ) diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index 6cc4f56011..ef8e7042cc 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -294,7 +294,8 @@ def forward_lm(self, **kwargs): pooled_output = [None, None] # Forward pass for the queries if "query_input_ids" in kwargs.keys(): - pooled_output1, hidden_states1 = self.language_model1(**kwargs) + query_params = {key.replace("query_", ""): value for key, value in kwargs.items() if key.startswith("query_")} + pooled_output1, hidden_states1 = self.language_model1(**query_params) pooled_output[0] = pooled_output1 # Forward pass for text passages and tables if "passage_input_ids" in kwargs.keys(): @@ -347,7 +348,8 @@ def forward_lm(self, **kwargs): pooled_output[1] = pooled_output_combined # Current batch consists of only texts else: - pooled_output2, hidden_states2 = self.language_model2(**kwargs) + passage_params = {key.replace("passage_", ""): value for key, value in kwargs.items() if key.startswith("passage_")} + pooled_output2, hidden_states2 = self.language_model2(**passage_params) pooled_output[1] = pooled_output2 return tuple(pooled_output) diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 33b862e178..f3cdeaf1d7 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -1,4 +1,4 @@ -from 
typing import List, Dict, Union, Optional, Any +from typing import List, Dict, Union, Optional, Any, Type import logging from pathlib import Path @@ -13,7 +13,7 @@ from torch.utils.data.sampler import SequentialSampler import pandas as pd from huggingface_hub import hf_hub_download -from transformers import AutoConfig +from transformers import AutoConfig, DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast, PreTrainedTokenizer from haystack.errors import HaystackError from haystack.schema import Document @@ -152,38 +152,32 @@ def __init__( ) self.infer_tokenizer_classes = infer_tokenizer_classes - tokenizers_default_classes = {"query": "AutoTokenizer", "passage": "AutoTokenizer"} - if self.infer_tokenizer_classes: - tokenizers_default_classes["query"] = None # type: ignore - tokenizers_default_classes["passage"] = None # type: ignore # Init & Load Encoders - self.query_tokenizer = get_tokenizer( + self.query_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained( pretrained_model_name_or_path=query_embedding_model, revision=model_version, do_lower_case=True, use_fast=use_fast_tokenizers, - tokenizer_class=tokenizers_default_classes["query"], use_auth_token=use_auth_token, ) self.query_encoder = get_language_model( pretrained_model_name_or_path=query_embedding_model, revision=model_version, - language_model_class="DPRQuestionEncoder", + language_model_type="DPRQuestionEncoder", use_auth_token=use_auth_token, ) - self.passage_tokenizer = get_tokenizer( + self.passage_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained( pretrained_model_name_or_path=passage_embedding_model, revision=model_version, do_lower_case=True, use_fast=use_fast_tokenizers, - tokenizer_class=tokenizers_default_classes["passage"], use_auth_token=use_auth_token, ) self.passage_encoder = get_language_model( pretrained_model_name_or_path=passage_embedding_model, revision=model_version, - language_model_class="DPRContextEncoder", + language_model_type="DPRContextEncoder", use_auth_token=use_auth_token, ) @@ -498,7 +492,14 @@ def _get_predictions(self, dicts): # get logits with torch.no_grad(): - query_embeddings, passage_embeddings = self.model.forward(**batch)[0] + query_embeddings, passage_embeddings = self.model.forward( + query_input_ids=batch.get("query_input_ids", None), + query_segment_ids=batch.get("query_segment_ids", None), + query_attention_mask=batch.get("query_attention_mask", None), + passage_input_ids=batch.get("passage_input_ids", None), + passage_segment_ids=batch.get("passage_segment_ids", None), + passage_attention_mask=batch.get("passage_attention_mask", None) + )[0] if query_embeddings is not None: all_embeddings["query"].append(query_embeddings.cpu().numpy()) if passage_embeddings is not None: @@ -856,10 +857,10 @@ def __init__( ) self.infer_tokenizer_classes = infer_tokenizer_classes - tokenizers_default_classes = { - "query": "DPRQuestionEncoderTokenizer", - "passage": "DPRContextEncoderTokenizer", - "table": "DPRContextEncoderTokenizer", + tokenizers_default_classes: Dict[str, Type[PreTrainedTokenizer]] = { + "query": DPRQuestionEncoderTokenizerFast, + "passage": DPRContextEncoderTokenizerFast, + "table": DPRContextEncoderTokenizerFast, } if self.infer_tokenizer_classes: tokenizers_default_classes["query"] = None # type: ignore @@ -867,46 +868,43 @@ def __init__( tokenizers_default_classes["table"] = None # type: ignore # Init & Load Encoders - self.query_tokenizer = get_tokenizer( - pretrained_model_name_or_path=query_embedding_model, + self.query_tokenizer = 
tokenizers_default_classes["query"].from_pretrained( + query_embedding_model, revision=model_version, do_lower_case=True, use_fast=use_fast_tokenizers, - tokenizer_class=tokenizers_default_classes["query"], use_auth_token=use_auth_token, ) self.query_encoder = get_language_model( pretrained_model_name_or_path=query_embedding_model, + language_model_type="DPRQuestionEncoder", revision=model_version, - language_model_class="DPRQuestionEncoder", use_auth_token=use_auth_token, ) - self.passage_tokenizer = get_tokenizer( - pretrained_model_name_or_path=passage_embedding_model, + self.passage_tokenizer = tokenizers_default_classes["passage"].from_pretrained( + passage_embedding_model, revision=model_version, do_lower_case=True, use_fast=use_fast_tokenizers, - tokenizer_class=tokenizers_default_classes["passage"], use_auth_token=use_auth_token, ) self.passage_encoder = get_language_model( pretrained_model_name_or_path=passage_embedding_model, + language_model_type="DPRContextEncoder", revision=model_version, - language_model_class="DPRContextEncoder", use_auth_token=use_auth_token, ) - self.table_tokenizer = get_tokenizer( - pretrained_model_name_or_path=table_embedding_model, + self.table_tokenizer = tokenizers_default_classes["table"].from_pretrained( + table_embedding_model, revision=model_version, do_lower_case=True, use_fast=use_fast_tokenizers, - tokenizer_class=tokenizers_default_classes["table"], use_auth_token=use_auth_token, ) self.table_encoder = get_language_model( pretrained_model_name_or_path=table_embedding_model, + language_model_type="DPRContextEncoder", revision=model_version, - language_model_class="DPRContextEncoder", use_auth_token=use_auth_token, ) diff --git a/test/modeling/test_modeling_dpr.py b/test/modeling/test_dpr.py similarity index 98% rename from test/modeling/test_modeling_dpr.py rename to test/modeling/test_dpr.py index ab8aa96db2..857d0527a2 100644 --- a/test/modeling/test_modeling_dpr.py +++ b/test/modeling/test_dpr.py @@ -50,7 +50,7 @@ def test_dpr_modules(caplog=None): question_language_model = DPREncoder( pretrained_model_name_or_path="bert-base-uncased", model_type="DPRQuestionEncoder", - transformers_kwargs={ + model_kwargs={ "hidden_dropout_prob": 0, "attention_probs_dropout_prob": 0, } @@ -58,7 +58,7 @@ def test_dpr_modules(caplog=None): passage_language_model = DPREncoder( pretrained_model_name_or_path="bert-base-uncased", model_type="DPRContextEncoder", - transformers_kwargs={ + model_kwargs={ "hidden_dropout_prob": 0, "attention_probs_dropout_prob": 0, } @@ -165,7 +165,14 @@ def test_dpr_modules(caplog=None): ) # test logits and loss - embeddings = model(**features) + embeddings = model( + query_input_ids=features.get("query_input_ids", None), + query_segment_ids=features.get("query_segment_ids", None), + query_attention_mask=features.get("query_attention_mask", None), + passage_input_ids=features.get("passage_input_ids", None), + passage_segment_ids=features.get("passage_segment_ids", None), + passage_attention_mask=features.get("passage_attention_mask", None), + ) query_emb, passage_emb = embeddings[0] assert torch.all(torch.eq(query_emb.cpu(), query_vector.cpu())) assert torch.all(torch.eq(passage_emb.cpu(), passage_vector.cpu())) @@ -808,7 +815,14 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ # get logits with torch.no_grad(): - query_embeddings, passage_embeddings = model.forward(**batch)[0] + query_embeddings, passage_embeddings = model.forward( + query_input_ids=batch.get("query_input_ids", None), + 
query_segment_ids=batch.get("query_segment_ids", None), + query_attention_mask=batch.get("query_attention_mask", None), + passage_input_ids=batch.get("passage_input_ids", None), + passage_segment_ids=batch.get("passage_segment_ids", None), + passage_attention_mask=batch.get("passage_attention_mask", None) + )[0] if query_embeddings is not None: all_embeddings["query"].append(query_embeddings.cpu().numpy()) if passage_embeddings is not None: diff --git a/test/modeling/test_modeling_inference.py b/test/modeling/test_inference.py similarity index 100% rename from test/modeling/test_modeling_inference.py rename to test/modeling/test_inference.py diff --git a/test/modeling/test_modeling_prediction_head.py b/test/modeling/test_prediction_head.py similarity index 100% rename from test/modeling/test_modeling_prediction_head.py rename to test/modeling/test_prediction_head.py diff --git a/test/modeling/test_modeling_processor.py b/test/modeling/test_processor.py similarity index 100% rename from test/modeling/test_modeling_processor.py rename to test/modeling/test_processor.py diff --git a/test/modeling/test_modeling_processor_saving_loading.py b/test/modeling/test_processor_save_load.py similarity index 100% rename from test/modeling/test_modeling_processor_saving_loading.py rename to test/modeling/test_processor_save_load.py diff --git a/test/modeling/test_modeling_question_answering.py b/test/modeling/test_question_answering.py similarity index 100% rename from test/modeling/test_modeling_question_answering.py rename to test/modeling/test_question_answering.py From 39419f3a2aabbe8daa7f8b44db270db84e63deb9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Jun 2022 10:15:56 +0000 Subject: [PATCH 08/89] Update Documentation & Code Style --- haystack/errors.py | 4 +- haystack/modeling/data_handler/data_silo.py | 14 +- haystack/modeling/data_handler/processor.py | 14 +- haystack/modeling/infer.py | 14 +- haystack/modeling/model/adaptive_model.py | 21 ++- haystack/modeling/model/biadaptive_model.py | 12 +- haystack/modeling/model/language_model.py | 167 +++++++++--------- haystack/modeling/model/tokenization.py | 20 +-- haystack/modeling/model/triadaptive_model.py | 8 +- haystack/modeling/training/base.py | 59 ++++--- .../nodes/retriever/_embedding_encoder.py | 1 - haystack/nodes/retriever/dense.py | 9 +- test/modeling/test_dpr.py | 40 ++--- 13 files changed, 186 insertions(+), 197 deletions(-) diff --git a/haystack/errors.py b/haystack/errors.py index d5b1da13a2..af435557c9 100644 --- a/haystack/errors.py +++ b/haystack/errors.py @@ -38,9 +38,7 @@ def __repr__(self): class ModelingError(HaystackError): """Exception for issues raised by the modeling module""" - def __init__( - self, message: Optional[str] = None, docs_link: Optional[str] = "https://haystack.deepset.ai/" - ): + def __init__(self, message: Optional[str] = None, docs_link: Optional[str] = "https://haystack.deepset.ai/"): super().__init__(message=message, docs_link=docs_link) diff --git a/haystack/modeling/data_handler/data_silo.py b/haystack/modeling/data_handler/data_silo.py index 062b1d85c3..e7c0929f1a 100644 --- a/haystack/modeling/data_handler/data_silo.py +++ b/haystack/modeling/data_handler/data_silo.py @@ -811,11 +811,15 @@ def _run_teacher(self, batch: dict) -> List[torch.Tensor]: """ Run the teacher model on the given batch. 
""" - params = {'input_ids': batch["input_ids"], 'segment_ids': batch["segment_ids"], 'padding_mask': batch["padding_mask"]} - if 'output_hidden_states' in batch.keys(): - params['output_hidden_states'] = batch["output_hidden_states"] - if 'output_attentions' in batch.keys(): - params['output_attentions'] = batch["output_attentions"] + params = { + "input_ids": batch["input_ids"], + "segment_ids": batch["segment_ids"], + "padding_mask": batch["padding_mask"], + } + if "output_hidden_states" in batch.keys(): + params["output_hidden_states"] = batch["output_hidden_states"] + if "output_attentions" in batch.keys(): + params["output_attentions"] = batch["output_attentions"] return self.teacher.inferencer.model(**params) def _pass_batches( diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index 3164411e12..cd1c8557a6 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -178,9 +178,7 @@ def load_from_dir(cls, load_dir: str): "Loading tokenizer from deprecated config. " "If you used `custom_vocab` or `never_split_chars`, this won't work anymore." ) - tokenizer = get_tokenizer( - load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"] - ) + tokenizer = get_tokenizer(load_dir, tokenizer_class=config["tokenizer"], do_lower_case=config["lower_case"]) else: tokenizer = get_tokenizer(load_dir, tokenizer_class=config["tokenizer"]) @@ -919,9 +917,13 @@ def load_from_dir(cls, load_dir: str): config = json.load(open(processor_config_file)) # init tokenizers query_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["query_tokenizer"]) - query_tokenizer = query_tokenizer_class.from_pretrained(pretrained_model_name_or_path=load_dir, subfolder="query") - passage_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["passage_tokenizer"]) - passage_tokenizer = passage_tokenizer_class.from_pretrained(pretrained_model_name_or_path=load_dir, subfolder="passage") + query_tokenizer = query_tokenizer_class.from_pretrained( + pretrained_model_name_or_path=load_dir, subfolder="query" + ) + passage_tokenizer_class: Type[PreTrainedTokenizer] = getattr(transformers, config["passage_tokenizer"]) + passage_tokenizer = passage_tokenizer_class.from_pretrained( + pretrained_model_name_or_path=load_dir, subfolder="passage" + ) # we have to delete the tokenizer string from config, because we pass it as Object del config["query_tokenizer"] diff --git a/haystack/modeling/infer.py b/haystack/modeling/infer.py index 85ab58cfe2..adfddf1d50 100644 --- a/haystack/modeling/infer.py +++ b/haystack/modeling/infer.py @@ -470,9 +470,7 @@ def _get_predictions(self, dataset: Dataset, tensor_names: List, baskets): with torch.no_grad(): logits = self.model.forward(**batch) preds = self.model.formatted_preds( - logits=logits, - samples=batch_samples, - padding_mask=batch.get("padding_mask", None) + logits=logits, samples=batch_samples, padding_mask=batch.get("padding_mask", None) ) preds_all += preds return preds_all @@ -510,11 +508,11 @@ def _get_predictions_and_aggregate(self, dataset: Dataset, tensor_names: List, b # Aggregation works on preds, not logits. 
We want as much processing happening in one batch + on GPU # So we transform logits to preds here as well logits = self.model.forward( - input_ids=batch["input_ids"], - segment_ids=batch["segment_ids"], - padding_mask=batch["padding_mask"], - output_hidden_states=batch.get("output_hidden_states", False), - output_attentions=batch.get("output_attentions", False) + input_ids=batch["input_ids"], + segment_ids=batch["segment_ids"], + padding_mask=batch["padding_mask"], + output_hidden_states=batch.get("output_hidden_states", False), + output_attentions=batch.get("output_attentions", False), ) # preds = self.model.logits_to_preds(logits, **batch)[0] (This must somehow be useful for SQuAD) preds = self.model.logits_to_preds(logits, **batch) diff --git a/haystack/modeling/model/adaptive_model.py b/haystack/modeling/model/adaptive_model.py index bd11b10981..e3af09286b 100644 --- a/haystack/modeling/model/adaptive_model.py +++ b/haystack/modeling/model/adaptive_model.py @@ -328,7 +328,9 @@ def convert_from_transformers( :return: AdaptiveModel """ - lm = get_language_model(model_name_or_path, revision=revision, use_auth_token=use_auth_token, model_kwargs=kwargs) + lm = get_language_model( + model_name_or_path, revision=revision, use_auth_token=use_auth_token, model_kwargs=kwargs + ) if task_type is None: # Infer task type from config architecture = lm.model.config.architectures[0] @@ -456,12 +458,13 @@ def prepare_labels(self, **kwargs): all_labels.append(labels) return all_labels - def forward(self, + def forward( + self, input_ids: torch.Tensor, segment_ids: torch.Tensor, padding_mask: torch.Tensor, - output_hidden_states: bool = False, - output_attentions: bool = False + output_hidden_states: bool = False, + output_attentions: bool = False, ): """ Push data through the whole model and returns logits. The data will @@ -479,11 +482,11 @@ def forward(self, """ # Run forward pass of language model output_tuple = self.language_model.forward( - input_ids=input_ids, - segment_ids=segment_ids, - attention_mask=padding_mask, - output_hidden_states=output_hidden_states, - output_attentions=output_attentions + input_ids=input_ids, + segment_ids=segment_ids, + attention_mask=padding_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, ) if output_hidden_states: if output_attentions: diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 9718117350..5cd1a37459 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -265,7 +265,7 @@ def forward( query_attention_mask: Optional[torch.Tensor] = None, passage_input_ids: Optional[torch.Tensor] = None, passage_segment_ids: Optional[torch.Tensor] = None, - passage_attention_mask: Optional[torch.Tensor] = None + passage_attention_mask: Optional[torch.Tensor] = None, ): """ Push data through the whole model and returns logits. 
The data will propagate through @@ -283,7 +283,7 @@ def forward( query_attention_mask=query_attention_mask, passage_input_ids=passage_input_ids, passage_segment_ids=passage_segment_ids, - passage_attention_mask=passage_attention_mask + passage_attention_mask=passage_attention_mask, ) # Run forward pass of (multiple) prediction heads using the output from above @@ -338,17 +338,13 @@ def forward_lm( if query_input_ids is not None: pooled_output1, _ = self.language_model1( - input_ids=query_input_ids, - segment_ids=query_segment_ids, - attention_mask=query_attention_mask + input_ids=query_input_ids, segment_ids=query_segment_ids, attention_mask=query_attention_mask ) pooled_output[0] = pooled_output1 if passage_input_ids is not None: pooled_output2, _ = self.language_model2( - input_ids=passage_input_ids, - segment_ids=passage_segment_ids, - attention_mask=passage_attention_mask + input_ids=passage_input_ids, segment_ids=passage_segment_ids, attention_mask=passage_attention_mask ) pooled_output[1] = pooled_output2 diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index fe896c2dc4..6b7c7b0a8c 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -38,14 +38,14 @@ from haystack.errors import ModelingError LANGUAGE_HINTS = ( - ("german", "german"), - ("english", "english"), - ("chinese", "chinese"), - ("indian", "indian"), - ("french", "french"), - ("camembert", "french"), - ("polish", "polish"), - ("spanish", "spanish"), + ("german", "german"), + ("english", "english"), + ("chinese", "chinese"), + ("indian", "indian"), + ("french", "french"), + ("camembert", "french"), + ("polish", "polish"), + ("spanish", "spanish"), ("umberto", "italian"), ("multilingual", "multilingual"), ) @@ -58,7 +58,6 @@ OUTPUT_DIM_NAMES = ["dim", "hidden_size", "d_model"] - def silence_transformers_logs(from_pretrained_func): """ A wrapper that raises the log level of Transformers to @@ -86,13 +85,13 @@ def quiet_from_pretrained_func(cls, *args, **kwargs): # TODO analyse if LMs can be completely used through HF transformers class LanguageModel(nn.Module, ABC): """ - The parent class for any kind of model that can embed language into a semantic vector space. + The parent class for any kind of model that can embed language into a semantic vector space. These models read in tokenized sentences and return vectors that capture the meaning of sentences or of tokens. 
""" def __init__(self, name: str): super().__init__() - self._output_dims = None + self._output_dims = None self.name = name @property @@ -101,13 +100,13 @@ def encoder(self): @abstractmethod def forward( - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, + self, + input_ids: torch.Tensor, + segment_ids: torch.Tensor, padding_mask: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, - output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, ): raise NotImplementedError @@ -173,11 +172,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Dict[Any, Any] = None): self.save_config(save_dir) def formatted_preds( - self, - logits, - samples, - ignore_first_token: bool = True, - padding_mask: torch.Tensor = None + self, logits, samples, ignore_first_token: bool = True, padding_mask: torch.Tensor = None ) -> List[Dict[str, Any]]: """ Extracting vectors from a language model (for example, for extracting sentence embeddings). @@ -229,7 +224,9 @@ def formatted_preds( elif self.extraction_strategy == "cls_token": vecs = sequence_output[:, 0, :].cpu().numpy() else: - raise NotImplementedError(f"This extraction strategy ({self.extraction_strategy}) is not supported by Haystack.") + raise NotImplementedError( + f"This extraction strategy ({self.extraction_strategy}) is not supported by Haystack." + ) preds = [] for vec, sample in zip(vecs, samples): @@ -239,7 +236,9 @@ def formatted_preds( preds.append(pred) return preds - def _pool_tokens(self, sequence_output: torch.Tensor, padding_mask: torch.Tensor, strategy: str, ignore_first_token: bool): + def _pool_tokens( + self, sequence_output: torch.Tensor, padding_mask: torch.Tensor, strategy: str, ignore_first_token: bool + ): token_vecs = sequence_output.cpu().numpy() # we only take the aggregated value of non-padding tokens padding_mask = padding_mask.cpu().numpy() @@ -265,13 +264,13 @@ class HFLanguageModel(LanguageModel): @silence_transformers_logs def __init__( - self, - pretrained_model_name_or_path: Union[Path, str], + self, + pretrained_model_name_or_path: Union[Path, str], model_type: str, - language: str = None, - n_added_tokens: int = 0, + language: str = None, + n_added_tokens: int = 0, use_auth_token: Optional[Union[str, bool]] = None, - model_kwargs: Optional[Dict[str, Any]] = None + model_kwargs: Optional[Dict[str, Any]] = None, ): """ Load a pretrained model by supplying one of the following: @@ -288,7 +287,7 @@ def __init__( :param use_auth_token: the HF token or False """ super().__init__(name=model_type) - + config_class: PretrainedConfig = getattr(transformers, model_type + "Config", None) model_class: PreTrainedModel = getattr(transformers, model_type + "Model", None) @@ -298,13 +297,17 @@ def __init__( # Haystack style haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" model_config = config_class.from_pretrained(haystack_lm_config) - self.model = model_class.from_pretrained(haystack_lm_model, config=model_config, use_auth_token=use_auth_token, **(model_kwargs or {})) + self.model = model_class.from_pretrained( + haystack_lm_model, config=model_config, use_auth_token=use_auth_token, **(model_kwargs or {}) + ) self.language = self.model.config.language else: # Pytorch-transformer Style - self.model = model_class.from_pretrained(str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **(model_kwargs or {})) + self.model = 
model_class.from_pretrained( + str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **(model_kwargs or {}) + ) self.language = language or _guess_language(pretrained_model_name_or_path) - + # resize embeddings in case of custom vocab if n_added_tokens != 0: # TODO verify for other models than BERT @@ -324,7 +327,7 @@ def forward( segment_ids: torch.Tensor, attention_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None + output_attentions: Optional[bool] = None, ): """ Perform the forward pass of the model. @@ -360,11 +363,11 @@ class HFLanguageModelWithPooler(HFLanguageModel): """ def __init__( - self, - pretrained_model_name_or_path: Union[Path, str], - language: str = None, - n_added_tokens: int = 0, - model_kwargs: Optional[Dict[str, Any]] = None + self, + pretrained_model_name_or_path: Union[Path, str], + language: str = None, + n_added_tokens: int = 0, + model_kwargs: Optional[Dict[str, Any]] = None, ): """ Load a pretrained model by supplying one of the following: @@ -396,7 +399,7 @@ def forward( # type: ignore segment_ids: torch.Tensor, attention_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None + output_attentions: Optional[bool] = None, ): """ Perform the forward pass of the model. @@ -424,6 +427,7 @@ class DPREncoder(LanguageModel): """ A DPREncoder model that wraps Hugging Face's implementation. """ + @silence_transformers_logs def __init__( self, @@ -431,7 +435,7 @@ def __init__( model_type: str, language: str = None, use_auth_token: Optional[Union[str, bool]] = None, - model_kwargs: Optional[Dict[str, Any]] = None + model_kwargs: Optional[Dict[str, Any]] = None, ): """ Load a pretrained model by supplying one of the following: @@ -457,9 +461,7 @@ def __init__( if original_model_config.model_type == "dpr": dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) - self.model = model_class.from_pretrained( - haystack_lm_model, config=dpr_config, **kwargs - ) + self.model = model_class.from_pretrained(haystack_lm_model, config=dpr_config, **kwargs) else: if original_model_config.model_type != "bert": logger.warning( @@ -468,9 +470,7 @@ def __init__( ) original_config_dict = vars(original_model_config) original_config_dict.update(kwargs) - self.model = model_class( - config=transformers.DPRConfig(**original_config_dict) - ) + self.model = model_class(config=transformers.DPRConfig(**original_config_dict)) language_model_type = _get_model_type(haystack_lm_config, use_auth_token=use_auth_token, **kwargs) # Find the class corresponding to this model type @@ -478,13 +478,12 @@ def __init__( if not language_model_class: raise ValueError( f"The type of model supplied ({language_model_type}) is not supported by Haystack. 
" - f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}") + f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" + ) # Instantiate the class for this model self.model.base_model.bert_model = language_model_class( - pretrained_model_name_or_path, - model_type=language_model_type, - **kwargs + pretrained_model_name_or_path, model_type=language_model_type, **kwargs ).model self.language = self.model.config.language @@ -508,9 +507,7 @@ def __init__( ) original_config_dict = vars(original_model_config) original_config_dict.update(kwargs) - self.model = model_class( - config=transformers.DPRConfig(**original_config_dict) - ) + self.model = model_class(config=transformers.DPRConfig(**original_config_dict)) self.model.base_model.bert_model = AutoModel.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **original_config_dict ) @@ -530,7 +527,7 @@ def save_config(self, save_dir: Union[Path, str]): # Therefore, we copy the model_type from the model config to DPRConfig setattr(transformers.DPRConfig, "model_type", self.model.config.model_type) super().save_config(save_dir=save_dir) - + def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] = None): """ Save the model `state_dict` and its configuration file so that it can be loaded again. @@ -538,9 +535,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] :param save_dir: The directory in which the model should be saved. :param state_dict: A dictionary containing the whole state of the module including names of layers. By default, the unchanged state dictionary of the module is used. """ - model_to_save = ( - self.model.module if hasattr(self.model, "module") else self.model - ) # Only save the model itself + model_to_save = self.model.module if hasattr(self.model, "module") else self.model # Only save the model itself if "dpr" not in self.model.config.model_type.lower(): if model_to_save.base_model_prefix.startswith("ctx_"): @@ -569,12 +564,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] super().save(save_dir=save_dir, state_dict=state_dict) - def forward( # type: ignore - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, - attention_mask: torch.Tensor - ): + def forward(self, input_ids: torch.Tensor, segment_ids: torch.Tensor, attention_mask: torch.Tensor): # type: ignore """ Perform the forward pass of the DPR encoder model. 
@@ -593,10 +583,7 @@ def forward( # type: ignore attention_mask = attention_mask.view(-1, max_seq_len) output_tuple = self.model( - input_ids=input_ids, - token_type_ids=segment_ids, - attention_mask=attention_mask, - return_dict=True, + input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True ) if self.encoder.config.output_hidden_states == True: pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states @@ -627,7 +614,6 @@ def forward( # type: ignore "WordEmbedding_LM": HFLanguageModel, "XLMRoberta": HFLanguageModel, "XLNet": HFLanguageModelWithPooler, - } NAME_HINTS = { "xlm.*roberta": "XLMRoberta", @@ -658,13 +644,14 @@ def forward( # type: ignore }, } + def get_language_model( - pretrained_model_name_or_path: Union[Path, str], - language_model_type: Optional[str] = None, + pretrained_model_name_or_path: Union[Path, str], + language_model_type: Optional[str] = None, use_auth_token: Optional[Union[str, bool]] = None, - revision: Optional[str] = None, + revision: Optional[str] = None, autoconfig_kwargs: Optional[Dict[str, Any]] = None, - model_kwargs: Optional[Dict[str, Any]] = None + model_kwargs: Optional[Dict[str, Any]] = None, ) -> LanguageModel: """ Load a pretrained language model by doing one of the following: @@ -698,9 +685,12 @@ def get_language_model( logger.info(f"Could not find '{pretrained_model_name_or_path}' locally.") logger.info(f"Looking on Transformers Model Hub (in local cache and online)...") language_model_type = _get_model_type( - pretrained_model_name_or_path, use_auth_token=use_auth_token, revision=revision, autoconfig_kwargs=autoconfig_kwargs + pretrained_model_name_or_path, + use_auth_token=use_auth_token, + revision=revision, + autoconfig_kwargs=autoconfig_kwargs, ) - if not language_model_type: + if not language_model_type: raise ModelingError( f"Model not found for '{pretrained_model_name_or_path}'. Either supply the local path for a saved " f"model or one of bert/roberta/xlnet/albert/distilbert models that can be downloaded from remote. " @@ -713,24 +703,25 @@ def get_language_model( if not language_model_class: raise ValueError( f"The type of model supplied ({language_model_type}) is not supported by Haystack. " - f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}") + f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" + ) # Instantiate the class for this model language_model = language_model_class( pretrained_model_name_or_path, model_type=language_model_type, use_auth_token=use_auth_token, - model_kwargs=model_kwargs + model_kwargs=model_kwargs, ) logger.info(f"Loaded '{pretrained_model_name_or_path}' ({language_model_type} model)") return language_model def _get_model_type( - model_name_or_path: Union[str, Path], + model_name_or_path: Union[str, Path], use_auth_token: Optional[Union[str, bool]] = None, - revision: Optional[str] = None, - autoconfig_kwargs: Optional[Dict[str, Any]] = None + revision: Optional[str] = None, + autoconfig_kwargs: Optional[Dict[str, Any]] = None, ) -> str: """ Given a model name, try to use AutoConfig to understand which model type it is. 
@@ -742,18 +733,16 @@ def _get_model_type( # Use AutoConfig to understand the model class try: config = AutoConfig.from_pretrained( - pretrained_model_name_or_path=model_name_or_path, - use_auth_token=use_auth_token, - revision=revision, - **(autoconfig_kwargs or {}) + pretrained_model_name_or_path=model_name_or_path, + use_auth_token=use_auth_token, + revision=revision, + **(autoconfig_kwargs or {}), ) # Find if this mode is present in MODEL_TYPE_BY_NAME.keys() even with a different capitalization model_type = {key.lower(): key for key in HUGGINGFACE_TO_HAYSTACK.keys()}.get(config.model_type.lower(), None) except Exception as e: - logger.exception( - f"AutoConfig failed to load on '{model_name_or_path}'. " - ) + logger.exception(f"AutoConfig failed to load on '{model_name_or_path}'. ") if not model_type: logger.warning("Could not infer the model type from its config. Looking for clues in the model name.") @@ -765,7 +754,9 @@ def _get_model_type( break if model_type and model_type.lower() == "roberta" and "mlm" in model_name_or_path.lower(): - logger.error(f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later.") + logger.error( + f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later." + ) if not model_type: logger.error("Model type not found. Using the AutoModel class. This can cause crashes later!") @@ -784,4 +775,4 @@ def _guess_language(name: str) -> str: else: language = "english" logger.info(f"Auto-detected model language: {language}") - return language \ No newline at end of file + return language diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index b7eb2f0c71..a7426c2a34 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -39,7 +39,7 @@ def get_tokenizer( **kwargs, ) -> PreTrainedTokenizer: """ - Enables loading of different Tokenizer classes with a uniform interface. + Enables loading of different Tokenizer classes with a uniform interface. Right now it always returns an instance of `AutoTokenizer`. :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`) @@ -57,17 +57,15 @@ def get_tokenizer( params = {} if any(tokenizer_type in model_name_or_path for tokenizer_type in ["albert", "xlnet"]): params["keep_accents"] = True - + return AutoTokenizer.from_pretrained( - model_name_or_path, - revision=revision, - use_fast=use_fast, - use_auth_token=use_auth_token, - **params, - **kwargs + model_name_or_path, revision=revision, use_fast=use_fast, use_auth_token=use_auth_token, **params, **kwargs ) -def tokenize_batch_question_answering(pre_baskets: Dict[Any, Any], tokenizer: PreTrainedTokenizer, indices: List[Any]) -> List[SampleBasket]: + +def tokenize_batch_question_answering( + pre_baskets: Dict[Any, Any], tokenizer: PreTrainedTokenizer, indices: List[Any] +) -> List[SampleBasket]: """ Tokenizes text data for question answering tasks. Tokenization means splitting words into subwords, depending on the tokenizer's vocabulary. 
@@ -248,7 +246,9 @@ def truncate_sequences( return (seq_a, seq_b, overflowing_tokens) -def _words_to_tokens(words: List[str], word_offsets: List[int], tokenizer: PreTrainedTokenizer) -> Tuple[str, List[str], List[int]]: +def _words_to_tokens( + words: List[str], word_offsets: List[int], tokenizer: PreTrainedTokenizer +) -> Tuple[str, List[str], List[int]]: """ Tokenize "words" into subword tokens while keeping track of offsets and if a token is the start of a word. :param words: list of words. diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index ef8e7042cc..66bee19507 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -294,7 +294,9 @@ def forward_lm(self, **kwargs): pooled_output = [None, None] # Forward pass for the queries if "query_input_ids" in kwargs.keys(): - query_params = {key.replace("query_", ""): value for key, value in kwargs.items() if key.startswith("query_")} + query_params = { + key.replace("query_", ""): value for key, value in kwargs.items() if key.startswith("query_") + } pooled_output1, hidden_states1 = self.language_model1(**query_params) pooled_output[0] = pooled_output1 # Forward pass for text passages and tables @@ -348,7 +350,9 @@ def forward_lm(self, **kwargs): pooled_output[1] = pooled_output_combined # Current batch consists of only texts else: - passage_params = {key.replace("passage_", ""): value for key, value in kwargs.items() if key.startswith("passage_")} + passage_params = { + key.replace("passage_", ""): value for key, value in kwargs.items() if key.startswith("passage_") + } pooled_output2, hidden_states2 = self.language_model2(**passage_params) pooled_output[1] = pooled_output2 diff --git a/haystack/modeling/training/base.py b/haystack/modeling/training/base.py index 0d585f75ff..c448cae3c1 100644 --- a/haystack/modeling/training/base.py +++ b/haystack/modeling/training/base.py @@ -250,7 +250,9 @@ def train(self): vocab_size1=len(self.data_silo.processor.query_tokenizer), vocab_size2=len(self.data_silo.processor.passage_tokenizer), ) - elif not self.model.language_model.name == "debertav2": # DebertaV2 has mismatched vocab size on purpose (see https://github.com/huggingface/transformers/issues/12428) + elif ( + not self.model.language_model.name == "debertav2" + ): # DebertaV2 has mismatched vocab size on purpose (see https://github.com/huggingface/transformers/issues/12428) self.model.verify_vocab_size(vocab_size=len(self.data_silo.processor.tokenizer)) self.model.train() @@ -764,14 +766,18 @@ def compute_loss(self, batch: dict, step: int) -> torch.Tensor: keys = list(batch.keys()) keys = [key for key in keys if key.startswith("teacher_output")] teacher_logits = [batch.pop(key) for key in keys] - - params = {'input_ids': batch["input_ids"], 'segment_ids': batch["segment_ids"], 'padding_mask': batch["padding_mask"]} - if 'output_hidden_states' in batch.keys(): - params['output_hidden_states'] = batch["output_hidden_states"] - if 'output_attentions' in batch.keys(): - params['output_attentions'] = batch["output_attentions"] + + params = { + "input_ids": batch["input_ids"], + "segment_ids": batch["segment_ids"], + "padding_mask": batch["padding_mask"], + } + if "output_hidden_states" in batch.keys(): + params["output_hidden_states"] = batch["output_hidden_states"] + if "output_attentions" in batch.keys(): + params["output_attentions"] = batch["output_attentions"] logits = self.model.forward(**params) - + student_loss = 
self.model.logits_to_loss(logits=logits, global_step=self.global_step, **batch) distillation_loss = self.distillation_loss_fn( student_logits=logits[0] / self.temperature, teacher_logits=teacher_logits[0] / self.temperature @@ -903,11 +909,15 @@ def __init__( self.loss = DataParallel(self.loss).to(device) def compute_loss(self, batch: dict, step: int) -> torch.Tensor: - params = {'input_ids': batch["input_ids"], 'segment_ids': batch["segment_ids"], 'padding_mask': batch["padding_mask"]} - if 'output_hidden_states' in batch.keys(): - params['output_hidden_states'] = batch["output_hidden_states"] - if 'output_attentions' in batch.keys(): - params['output_attentions'] = batch["output_attentions"] + params = { + "input_ids": batch["input_ids"], + "segment_ids": batch["segment_ids"], + "padding_mask": batch["padding_mask"], + } + if "output_hidden_states" in batch.keys(): + params["output_hidden_states"] = batch["output_hidden_states"] + if "output_attentions" in batch.keys(): + params["output_attentions"] = batch["output_attentions"] return self.backward_propagate(torch.sum(self.loss(**params)), step) @@ -954,26 +964,21 @@ def __init__(self, model: Union[DataParallel, AdaptiveModel], teacher_model: Mod else: self.dim_mappings.append(None) - def forward( - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, - padding_mask: torch.Tensor - ): + def forward(self, input_ids: torch.Tensor, segment_ids: torch.Tensor, padding_mask: torch.Tensor): with torch.no_grad(): _, teacher_hidden_states, teacher_attentions = self.teacher_model.forward( - input_ids=input_ids, - segment_ids=segment_ids, + input_ids=input_ids, + segment_ids=segment_ids, padding_mask=padding_mask, - output_attentions=True, - output_hidden_states=True + output_attentions=True, + output_hidden_states=True, ) _, hidden_states, attentions = self.model.forward( - input_ids=input_ids, - segment_ids=segment_ids, + input_ids=input_ids, + segment_ids=segment_ids, padding_mask=padding_mask, - output_attentions=True, - output_hidden_states=True + output_attentions=True, + output_hidden_states=True, ) loss = torch.tensor(0.0, device=input_ids.device) diff --git a/haystack/nodes/retriever/_embedding_encoder.py b/haystack/nodes/retriever/_embedding_encoder.py index 3bf9d74b67..447a92c80c 100644 --- a/haystack/nodes/retriever/_embedding_encoder.py +++ b/haystack/nodes/retriever/_embedding_encoder.py @@ -373,7 +373,6 @@ def save(self, save_dir: Union[Path, str]): raise NotImplementedError("save method can only be used with sentence-transformers EmbeddingRetriever(s)") - _EMBEDDING_ENCODERS: Dict[str, Callable] = { "farm": _DefaultEmbeddingEncoder, "transformers": _DefaultEmbeddingEncoder, diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index f3cdeaf1d7..66925af055 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -13,7 +13,12 @@ from torch.utils.data.sampler import SequentialSampler import pandas as pd from huggingface_hub import hf_hub_download -from transformers import AutoConfig, DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast, PreTrainedTokenizer +from transformers import ( + AutoConfig, + DPRContextEncoderTokenizerFast, + DPRQuestionEncoderTokenizerFast, + PreTrainedTokenizer, +) from haystack.errors import HaystackError from haystack.schema import Document @@ -498,7 +503,7 @@ def _get_predictions(self, dicts): query_attention_mask=batch.get("query_attention_mask", None), passage_input_ids=batch.get("passage_input_ids", None), 
passage_segment_ids=batch.get("passage_segment_ids", None), - passage_attention_mask=batch.get("passage_attention_mask", None) + passage_attention_mask=batch.get("passage_attention_mask", None), )[0] if query_embeddings is not None: all_embeddings["query"].append(query_embeddings.cpu().numpy()) diff --git a/test/modeling/test_dpr.py b/test/modeling/test_dpr.py index 857d0527a2..d14357d3fa 100644 --- a/test/modeling/test_dpr.py +++ b/test/modeling/test_dpr.py @@ -50,18 +50,12 @@ def test_dpr_modules(caplog=None): question_language_model = DPREncoder( pretrained_model_name_or_path="bert-base-uncased", model_type="DPRQuestionEncoder", - model_kwargs={ - "hidden_dropout_prob": 0, - "attention_probs_dropout_prob": 0, - } + model_kwargs={"hidden_dropout_prob": 0, "attention_probs_dropout_prob": 0}, ) passage_language_model = DPREncoder( pretrained_model_name_or_path="bert-base-uncased", model_type="DPRContextEncoder", - model_kwargs={ - "hidden_dropout_prob": 0, - "attention_probs_dropout_prob": 0, - } + model_kwargs={"hidden_dropout_prob": 0, "attention_probs_dropout_prob": 0}, ) prediction_head = TextSimilarityHead(similarity_function="dot_product") @@ -137,7 +131,9 @@ def test_dpr_modules(caplog=None): ) features_query = {key.replace("query_", ""): value for key, value in features.items() if key.startswith("query_")} - features_passage = {key.replace("passage_", ""): value for key, value in features.items() if key.startswith("passage_")} + features_passage = { + key.replace("passage_", ""): value for key, value in features.items() if key.startswith("passage_") + } # test model encodings query_vector = model.language_model1(**features_query)[0] @@ -708,13 +704,9 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ query_tokenizer = get_tokenizer( pretrained_model_name_or_path=query_embedding_model ) # tokenizer class is inferred automatically - query_encoder = get_language_model( - pretrained_model_name_or_path=query_embedding_model - ) + query_encoder = get_language_model(pretrained_model_name_or_path=query_embedding_model) passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_embedding_model) - passage_encoder = get_language_model( - pretrained_model_name_or_path=passage_embedding_model - ) + passage_encoder = get_language_model(pretrained_model_name_or_path=passage_embedding_model) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, @@ -756,15 +748,11 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ loaded_query_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, use_fast=True ) # tokenizer class is inferred automatically - loaded_query_encoder = get_language_model( - pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir - ) + loaded_query_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir) loaded_passage_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, use_fast=True ) - loaded_passage_encoder = get_language_model( - pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir - ) + loaded_passage_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir) loaded_processor = TextSimilarityProcessor( query_tokenizer=loaded_query_tokenizer, @@ -821,7 +809,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ query_attention_mask=batch.get("query_attention_mask", None), 
passage_input_ids=batch.get("passage_input_ids", None), passage_segment_ids=batch.get("passage_segment_ids", None), - passage_attention_mask=batch.get("passage_attention_mask", None) + passage_attention_mask=batch.get("passage_attention_mask", None), )[0] if query_embeddings is not None: all_embeddings["query"].append(query_embeddings.cpu().numpy()) @@ -875,13 +863,9 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ query_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir ) # tokenizer class is inferred automatically - query_encoder = get_language_model( - pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir - ) + query_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir) passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir) - passage_encoder = get_language_model( - pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir - ) + passage_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, From 6d4857be063fc319a88cde2e2413178c8a864cf9 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 12:33:08 +0200 Subject: [PATCH 09/89] Remove mentions to data2vecvision --- haystack/modeling/model/language_model.py | 2 - .../nodes/retriever/_embedding_encoder.py | 61 ------------------- test/modeling/test_dpr.py | 18 +++++- 3 files changed, 16 insertions(+), 65 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index fe896c2dc4..ef7ac237f0 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -613,7 +613,6 @@ def forward( # type: ignore "BigBird": HFLanguageModel, "Camembert": HFLanguageModel, "Codebert": HFLanguageModel, - "Data2VecVision": HFLanguageModel, "DebertaV2": HFLanguageModelWithPooler, "DistilBert": HFLanguageModelWithPooler, "DPRContextEncoder": DPREncoder, @@ -639,7 +638,6 @@ def forward( # type: ignore "dpr.*ctx.*encoder": "DPRContextEncoder", "mlm.*codebert": "Roberta", "deberta-v2": "DebertaV2", - "data2vec-vision": "Data2VecVision", } PARAMETERS_BY_MODEL = { "DistilBert": {"summary_last_dropout": 0, "summary_type": "first", "summary_activation": "tanh"}, diff --git a/haystack/nodes/retriever/_embedding_encoder.py b/haystack/nodes/retriever/_embedding_encoder.py index 3bf9d74b67..f0abede69a 100644 --- a/haystack/nodes/retriever/_embedding_encoder.py +++ b/haystack/nodes/retriever/_embedding_encoder.py @@ -314,70 +314,9 @@ def save(self, save_dir: Union[Path, str]): ) -class _Data2VecVisionEmbeddingEncoder(_BaseEmbeddingEncoder): - def __init__(self, retriever: "EmbeddingRetriever"): - - self.embedding_model = Inferencer.load( - retriever.embedding_model, - revision=retriever.model_version, - task_type="embeddings", - extraction_strategy=retriever.pooling_strategy, - extraction_layer=retriever.emb_extraction_layer, - gpu=retriever.use_gpu, - batch_size=retriever.batch_size, - max_seq_len=retriever.max_seq_len, - num_processes=0, - use_auth_token=retriever.use_auth_token, - ) - # Check that document_store has the right similarity function - similarity = retriever.document_store.similarity - # If we are using a sentence transformer model - if "sentence" in retriever.embedding_model.lower() and similarity != "cosine": - logger.warning( - f"You seem to be using a Sentence 
Transformer with the {similarity} function. " - f"We recommend using cosine instead. " - f"This can be set when initializing the DocumentStore" - ) - elif "dpr" in retriever.embedding_model.lower() and similarity != "dot_product": - logger.warning( - f"You seem to be using a DPR model with the {similarity} function. " - f"We recommend using dot_product instead. " - f"This can be set when initializing the DocumentStore" - ) - - def embed(self, texts: Union[List[List[str]], List[str], str]) -> List[np.ndarray]: - # TODO: FARM's `sample_to_features_text` need to fix following warning - - # tokenization_utils.py:460: FutureWarning: `is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead. - emb = self.embedding_model.inference_from_dicts(dicts=[{"text": t} for t in texts]) - emb = [(r["vec"]) for r in emb] - return emb - - def embed_queries(self, texts: List[str]) -> List[np.ndarray]: - return self.embed(texts) - - def embed_documents(self, docs: List[Document]) -> List[np.ndarray]: - passages = [d.content for d in docs] # type: ignore - return self.embed(passages) - - def train( - self, - training_data: List[Dict[str, Any]], - learning_rate: float = 2e-5, - n_epochs: int = 1, - num_warmup_steps: int = None, - batch_size: int = 16, - ): - raise NotImplementedError("train method can only be used with sentence-transformers EmbeddingRetriever(s)") - - def save(self, save_dir: Union[Path, str]): - raise NotImplementedError("save method can only be used with sentence-transformers EmbeddingRetriever(s)") - - - _EMBEDDING_ENCODERS: Dict[str, Callable] = { "farm": _DefaultEmbeddingEncoder, "transformers": _DefaultEmbeddingEncoder, "sentence_transformers": _SentenceTransformersEmbeddingEncoder, "retribert": _RetribertEmbeddingEncoder, - "data2vec_vision": _Data2VecVisionEmbeddingEncoder, } diff --git a/test/modeling/test_dpr.py b/test/modeling/test_dpr.py index 857d0527a2..ead858b9f2 100644 --- a/test/modeling/test_dpr.py +++ b/test/modeling/test_dpr.py @@ -849,7 +849,14 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ # get logits with torch.no_grad(): - query_embeddings, passage_embeddings = loaded_model.forward(**batch)[0] + query_embeddings, passage_embeddings = loaded_model.forward( + query_input_ids=batch.get("query_input_ids", None), + query_segment_ids=batch.get("query_segment_ids", None), + query_attention_mask=batch.get("query_attention_mask", None), + passage_input_ids=batch.get("passage_input_ids", None), + passage_segment_ids=batch.get("passage_segment_ids", None), + passage_attention_mask=batch.get("passage_attention_mask", None) + )[0] if query_embeddings is not None: all_embeddings2["query"].append(query_embeddings.cpu().numpy()) if passage_embeddings is not None: @@ -933,7 +940,14 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ # get logits with torch.no_grad(): - query_embeddings, passage_embeddings = loaded_model.forward(**batch)[0] + query_embeddings, passage_embeddings = loaded_model.forward( + query_input_ids=batch.get("query_input_ids", None), + query_segment_ids=batch.get("query_segment_ids", None), + query_attention_mask=batch.get("query_attention_mask", None), + passage_input_ids=batch.get("passage_input_ids", None), + passage_segment_ids=batch.get("passage_segment_ids", None), + passage_attention_mask=batch.get("passage_attention_mask", None) + )[0] if query_embeddings is not None: all_embeddings3["query"].append(query_embeddings.cpu().numpy()) if 
passage_embeddings is not None: From 63ab0cb2a87a3a0627c59de74aa02fa2e9bf39e0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Jun 2022 10:46:02 +0000 Subject: [PATCH 10/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 164 ++++++++++------------ test/modeling/test_dpr.py | 4 +- 2 files changed, 80 insertions(+), 88 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 00c40e33d8..44c5023dd2 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -38,14 +38,14 @@ from haystack.errors import ModelingError LANGUAGE_HINTS = ( - ("german", "german"), - ("english", "english"), - ("chinese", "chinese"), - ("indian", "indian"), - ("french", "french"), - ("camembert", "french"), - ("polish", "polish"), - ("spanish", "spanish"), + ("german", "german"), + ("english", "english"), + ("chinese", "chinese"), + ("indian", "indian"), + ("french", "french"), + ("camembert", "french"), + ("polish", "polish"), + ("spanish", "spanish"), ("umberto", "italian"), ("multilingual", "multilingual"), ) @@ -85,13 +85,13 @@ def quiet_from_pretrained_func(cls, *args, **kwargs): # TODO analyse if LMs can be completely used through HF transformers class LanguageModel(nn.Module, ABC): """ - The parent class for any kind of model that can embed language into a semantic vector space. + The parent class for any kind of model that can embed language into a semantic vector space. These models read in tokenized sentences and return vectors that capture the meaning of sentences or of tokens. """ def __init__(self, name: str): super().__init__() - self._output_dims = None + self._output_dims = None self.name = name @property @@ -100,11 +100,11 @@ def encoder(self): @abstractmethod def forward( - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, + self, + input_ids: torch.Tensor, + segment_ids: torch.Tensor, padding_mask: torch.Tensor = None, - output_hidden_states: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, ): raise NotImplementedError @@ -171,11 +171,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Dict[Any, Any] = None): self.save_config(save_dir) def formatted_preds( - self, - logits, - samples, - ignore_first_token: bool = True, - padding_mask: torch.Tensor = None + self, logits, samples, ignore_first_token: bool = True, padding_mask: torch.Tensor = None ) -> List[Dict[str, Any]]: """ Extracting vectors from a language model (for example, for extracting sentence embeddings). @@ -227,7 +223,9 @@ def formatted_preds( elif self.extraction_strategy == "cls_token": vecs = sequence_output[:, 0, :].cpu().numpy() else: - raise NotImplementedError(f"This extraction strategy ({self.extraction_strategy}) is not supported by Haystack.") + raise NotImplementedError( + f"This extraction strategy ({self.extraction_strategy}) is not supported by Haystack." 
+ ) preds = [] for vec, sample in zip(vecs, samples): @@ -237,7 +235,9 @@ def formatted_preds( preds.append(pred) return preds - def _pool_tokens(self, sequence_output: torch.Tensor, padding_mask: torch.Tensor, strategy: str, ignore_first_token: bool): + def _pool_tokens( + self, sequence_output: torch.Tensor, padding_mask: torch.Tensor, strategy: str, ignore_first_token: bool + ): token_vecs = sequence_output.cpu().numpy() # we only take the aggregated value of non-padding tokens padding_mask = padding_mask.cpu().numpy() @@ -263,13 +263,13 @@ class HFLanguageModel(LanguageModel): @silence_transformers_logs def __init__( - self, - pretrained_model_name_or_path: Union[Path, str], + self, + pretrained_model_name_or_path: Union[Path, str], model_type: str, - language: str = None, - n_added_tokens: int = 0, + language: str = None, + n_added_tokens: int = 0, use_auth_token: Optional[Union[str, bool]] = None, - model_kwargs: Optional[Dict[str, Any]] = None + model_kwargs: Optional[Dict[str, Any]] = None, ): """ Load a pretrained model by supplying one of the following: @@ -286,7 +286,7 @@ def __init__( :param use_auth_token: the HF token or False """ super().__init__(name=model_type) - + config_class: PretrainedConfig = getattr(transformers, model_type + "Config", None) model_class: PreTrainedModel = getattr(transformers, model_type + "Model", None) @@ -296,13 +296,17 @@ def __init__( # Haystack style haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" model_config = config_class.from_pretrained(haystack_lm_config) - self.model = model_class.from_pretrained(haystack_lm_model, config=model_config, use_auth_token=use_auth_token, **(model_kwargs or {})) + self.model = model_class.from_pretrained( + haystack_lm_model, config=model_config, use_auth_token=use_auth_token, **(model_kwargs or {}) + ) self.language = self.model.config.language else: # Pytorch-transformer Style - self.model = model_class.from_pretrained(str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **(model_kwargs or {})) + self.model = model_class.from_pretrained( + str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **(model_kwargs or {}) + ) self.language = language or _guess_language(pretrained_model_name_or_path) - + # resize embeddings in case of custom vocab if n_added_tokens != 0: # TODO verify for other models than BERT @@ -322,7 +326,7 @@ def forward( segment_ids: torch.Tensor, attention_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None + output_attentions: Optional[bool] = None, ): """ Perform the forward pass of the model. @@ -358,11 +362,11 @@ class HFLanguageModelWithPooler(HFLanguageModel): """ def __init__( - self, - pretrained_model_name_or_path: Union[Path, str], - language: str = None, - n_added_tokens: int = 0, - model_kwargs: Optional[Dict[str, Any]] = None + self, + pretrained_model_name_or_path: Union[Path, str], + language: str = None, + n_added_tokens: int = 0, + model_kwargs: Optional[Dict[str, Any]] = None, ): """ Load a pretrained model by supplying one of the following: @@ -394,7 +398,7 @@ def forward( # type: ignore segment_ids: torch.Tensor, attention_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, - output_attentions: Optional[bool] = None + output_attentions: Optional[bool] = None, ): """ Perform the forward pass of the model. @@ -422,6 +426,7 @@ class DPREncoder(LanguageModel): """ A DPREncoder model that wraps Hugging Face's implementation. 
""" + @silence_transformers_logs def __init__( self, @@ -429,7 +434,7 @@ def __init__( model_type: str, language: str = None, use_auth_token: Optional[Union[str, bool]] = None, - model_kwargs: Optional[Dict[str, Any]] = None + model_kwargs: Optional[Dict[str, Any]] = None, ): """ Load a pretrained model by supplying one of the following: @@ -455,9 +460,7 @@ def __init__( if original_model_config.model_type == "dpr": dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) - self.model = model_class.from_pretrained( - haystack_lm_model, config=dpr_config, **kwargs - ) + self.model = model_class.from_pretrained(haystack_lm_model, config=dpr_config, **kwargs) else: if original_model_config.model_type != "bert": logger.warning( @@ -466,9 +469,7 @@ def __init__( ) original_config_dict = vars(original_model_config) original_config_dict.update(kwargs) - self.model = model_class( - config=transformers.DPRConfig(**original_config_dict) - ) + self.model = model_class(config=transformers.DPRConfig(**original_config_dict)) language_model_type = _get_model_type(haystack_lm_config, use_auth_token=use_auth_token, **kwargs) # Find the class corresponding to this model type @@ -476,13 +477,12 @@ def __init__( if not language_model_class: raise ValueError( f"The type of model supplied ({language_model_type}) is not supported by Haystack. " - f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}") + f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" + ) # Instantiate the class for this model self.model.base_model.bert_model = language_model_class( - pretrained_model_name_or_path, - model_type=language_model_type, - **kwargs + pretrained_model_name_or_path, model_type=language_model_type, **kwargs ).model self.language = self.model.config.language @@ -506,9 +506,7 @@ def __init__( ) original_config_dict = vars(original_model_config) original_config_dict.update(kwargs) - self.model = model_class( - config=transformers.DPRConfig(**original_config_dict) - ) + self.model = model_class(config=transformers.DPRConfig(**original_config_dict)) self.model.base_model.bert_model = AutoModel.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **original_config_dict ) @@ -528,7 +526,7 @@ def save_config(self, save_dir: Union[Path, str]): # Therefore, we copy the model_type from the model config to DPRConfig setattr(transformers.DPRConfig, "model_type", self.model.config.model_type) super().save_config(save_dir=save_dir) - + def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] = None): """ Save the model `state_dict` and its configuration file so that it can be loaded again. @@ -536,9 +534,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] :param save_dir: The directory in which the model should be saved. :param state_dict: A dictionary containing the whole state of the module including names of layers. By default, the unchanged state dictionary of the module is used. 
""" - model_to_save = ( - self.model.module if hasattr(self.model, "module") else self.model - ) # Only save the model itself + model_to_save = self.model.module if hasattr(self.model, "module") else self.model # Only save the model itself if "dpr" not in self.model.config.model_type.lower(): if model_to_save.base_model_prefix.startswith("ctx_"): @@ -567,12 +563,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] super().save(save_dir=save_dir, state_dict=state_dict) - def forward( # type: ignore - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, - attention_mask: torch.Tensor - ): + def forward(self, input_ids: torch.Tensor, segment_ids: torch.Tensor, attention_mask: torch.Tensor): # type: ignore """ Perform the forward pass of the DPR encoder model. @@ -591,10 +582,7 @@ def forward( # type: ignore attention_mask = attention_mask.view(-1, max_seq_len) output_tuple = self.model( - input_ids=input_ids, - token_type_ids=segment_ids, - attention_mask=attention_mask, - return_dict=True, + input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True ) if self.encoder.config.output_hidden_states == True: pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states @@ -624,7 +612,6 @@ def forward( # type: ignore "WordEmbedding_LM": HFLanguageModel, "XLMRoberta": HFLanguageModel, "XLNet": HFLanguageModelWithPooler, - } NAME_HINTS = { "xlm.*roberta": "XLMRoberta", @@ -654,13 +641,14 @@ def forward( # type: ignore }, } + def get_language_model( - pretrained_model_name_or_path: Union[Path, str], - language_model_type: Optional[str] = None, + pretrained_model_name_or_path: Union[Path, str], + language_model_type: Optional[str] = None, use_auth_token: Optional[Union[str, bool]] = None, - revision: Optional[str] = None, + revision: Optional[str] = None, autoconfig_kwargs: Optional[Dict[str, Any]] = None, - model_kwargs: Optional[Dict[str, Any]] = None + model_kwargs: Optional[Dict[str, Any]] = None, ) -> LanguageModel: """ Load a pretrained language model by doing one of the following: @@ -694,9 +682,12 @@ def get_language_model( logger.info(f"Could not find '{pretrained_model_name_or_path}' locally.") logger.info(f"Looking on Transformers Model Hub (in local cache and online)...") language_model_type = _get_model_type( - pretrained_model_name_or_path, use_auth_token=use_auth_token, revision=revision, autoconfig_kwargs=autoconfig_kwargs + pretrained_model_name_or_path, + use_auth_token=use_auth_token, + revision=revision, + autoconfig_kwargs=autoconfig_kwargs, ) - if not language_model_type: + if not language_model_type: raise ModelingError( f"Model not found for '{pretrained_model_name_or_path}'. Either supply the local path for a saved " f"model or one of bert/roberta/xlnet/albert/distilbert models that can be downloaded from remote. " @@ -709,24 +700,25 @@ def get_language_model( if not language_model_class: raise ValueError( f"The type of model supplied ({language_model_type}) is not supported by Haystack. 
" - f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}") + f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" + ) # Instantiate the class for this model language_model = language_model_class( pretrained_model_name_or_path, model_type=language_model_type, use_auth_token=use_auth_token, - model_kwargs=model_kwargs + model_kwargs=model_kwargs, ) logger.info(f"Loaded '{pretrained_model_name_or_path}' ({language_model_type} model)") return language_model def _get_model_type( - model_name_or_path: Union[str, Path], + model_name_or_path: Union[str, Path], use_auth_token: Optional[Union[str, bool]] = None, - revision: Optional[str] = None, - autoconfig_kwargs: Optional[Dict[str, Any]] = None + revision: Optional[str] = None, + autoconfig_kwargs: Optional[Dict[str, Any]] = None, ) -> str: """ Given a model name, try to use AutoConfig to understand which model type it is. @@ -738,18 +730,16 @@ def _get_model_type( # Use AutoConfig to understand the model class try: config = AutoConfig.from_pretrained( - pretrained_model_name_or_path=model_name_or_path, - use_auth_token=use_auth_token, - revision=revision, - **(autoconfig_kwargs or {}) + pretrained_model_name_or_path=model_name_or_path, + use_auth_token=use_auth_token, + revision=revision, + **(autoconfig_kwargs or {}), ) # Find if this mode is present in MODEL_TYPE_BY_NAME.keys() even with a different capitalization model_type = {key.lower(): key for key in HUGGINGFACE_TO_HAYSTACK.keys()}.get(config.model_type.lower(), None) except Exception as e: - logger.exception( - f"AutoConfig failed to load on '{model_name_or_path}'. " - ) + logger.exception(f"AutoConfig failed to load on '{model_name_or_path}'. ") if not model_type: logger.warning("Could not infer the model type from its config. Looking for clues in the model name.") @@ -761,7 +751,9 @@ def _get_model_type( break if model_type and model_type.lower() == "roberta" and "mlm" in model_name_or_path.lower(): - logger.error(f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later.") + logger.error( + f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later." + ) if not model_type: logger.error("Model type not found. Using the AutoModel class. 
This can cause crashes later!") @@ -780,4 +772,4 @@ def _guess_language(name: str) -> str: else: language = "english" logger.info(f"Auto-detected model language: {language}") - return language \ No newline at end of file + return language diff --git a/test/modeling/test_dpr.py b/test/modeling/test_dpr.py index 6ef2f1e9ed..d66e2f9d4f 100644 --- a/test/modeling/test_dpr.py +++ b/test/modeling/test_dpr.py @@ -843,7 +843,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ query_attention_mask=batch.get("query_attention_mask", None), passage_input_ids=batch.get("passage_input_ids", None), passage_segment_ids=batch.get("passage_segment_ids", None), - passage_attention_mask=batch.get("passage_attention_mask", None) + passage_attention_mask=batch.get("passage_attention_mask", None), )[0] if query_embeddings is not None: all_embeddings2["query"].append(query_embeddings.cpu().numpy()) @@ -930,7 +930,7 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ query_attention_mask=batch.get("query_attention_mask", None), passage_input_ids=batch.get("passage_input_ids", None), passage_segment_ids=batch.get("passage_segment_ids", None), - passage_attention_mask=batch.get("passage_attention_mask", None) + passage_attention_mask=batch.get("passage_attention_mask", None), )[0] if query_embeddings is not None: all_embeddings3["query"].append(query_embeddings.cpu().numpy()) From 34b99739f283ddc94794dfe29687c56b6e3f2e40 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 16:11:37 +0200 Subject: [PATCH 11/89] fixing mypy issues --- haystack/modeling/data_handler/processor.py | 4 ++-- haystack/modeling/model/biadaptive_model.py | 2 +- haystack/modeling/model/language_model.py | 11 +++++------ haystack/modeling/model/tokenization.py | 8 ++++---- test/samples/squad/tiny_augmented.json | 2 +- 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index cd1c8557a6..92b28212e7 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -438,7 +438,7 @@ def __init__( "using the default task or add a custom task later via processor.add_task()" ) - def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False): + def dataset_from_dicts(self, dicts: List[dict], indices: List[int] = [], return_baskets: bool = False): """ Convert input dictionaries into a pytorch dataset for Question Answering. For this we have an internal representation called "baskets". @@ -485,7 +485,7 @@ def file_to_dicts(self, file: str) -> List[dict]: return dicts # TODO use Input Objects instead of this function, remove Natural Questions (NQ) related code - def convert_qa_input_dict(self, infer_dict: dict): + def convert_qa_input_dict(self, infer_dict: dict) -> Dict[str, Any]: """Input dictionaries in QA can either have ["context", "qas"] (internal format) as keys or ["text", "questions"] (api format). This function converts the latter into the former. It also converts the is_impossible field to answer_type so that NQ and SQuAD dicts have the same format. 
diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 5cd1a37459..7b5beadc61 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -344,7 +344,7 @@ def forward_lm( if passage_input_ids is not None: pooled_output2, _ = self.language_model2( - input_ids=passage_input_ids, segment_ids=passage_segment_ids, attention_mask=passage_attention_mask + input_ids=passage_input_ids[0], segment_ids=passage_segment_ids[0], attention_mask=passage_attention_mask[0] ) pooled_output[1] = pooled_output2 diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 00c40e33d8..2f237638f5 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -388,7 +388,7 @@ def __init__( self.pooler = SequenceSummary(config) self.pooler.apply(self.model._init_weights) - def forward( # type: ignore + def forward( self, input_ids: torch.Tensor, segment_ids: torch.Tensor, @@ -444,7 +444,7 @@ def __init__( kwargs = model_kwargs or {} model_classname = f"DPR{self.role.capitalize()}Encoder" - model_class: Type[PreTrainedModel] = getattr(transformers, model_classname, None) + model_class: Optional[Type[PreTrainedModel]] = getattr(transformers, model_classname, None) # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -481,7 +481,6 @@ def __init__( # Instantiate the class for this model self.model.base_model.bert_model = language_model_class( pretrained_model_name_or_path, - model_type=language_model_type, **kwargs ).model @@ -604,7 +603,7 @@ def forward( # type: ignore return pooled_output, None -HUGGINGFACE_TO_HAYSTACK = { +HUGGINGFACE_TO_HAYSTACK: Dict[str, Type[LanguageModel]] = { "Auto": HFLanguageModel, "Albert": HFLanguageModel, "Bert": HFLanguageModel, @@ -626,7 +625,7 @@ def forward( # type: ignore "XLNet": HFLanguageModelWithPooler, } -NAME_HINTS = { +NAME_HINTS: Dict[str, str] = { "xlm.*roberta": "XLMRoberta", "roberta.*xml": "XLMRoberta", "codebert.*mlm": "Roberta", @@ -637,7 +636,7 @@ def forward( # type: ignore "mlm.*codebert": "Roberta", "deberta-v2": "DebertaV2", } -PARAMETERS_BY_MODEL = { +PARAMETERS_BY_MODEL: Dict[str, Dict[str, Any]] = { "DistilBert": {"summary_last_dropout": 0, "summary_type": "first", "summary_activation": "tanh"}, "XLNet": {"summary_last_dropout": 0}, "Electra": { diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index a7426c2a34..d05ae8dbdf 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -248,7 +248,7 @@ def truncate_sequences( def _words_to_tokens( words: List[str], word_offsets: List[int], tokenizer: PreTrainedTokenizer -) -> Tuple[str, List[str], List[int]]: +) -> Tuple[List[str], List[int], List[bool]]: """ Tokenize "words" into subword tokens while keeping track of offsets and if a token is the start of a word. :param words: list of words. @@ -256,9 +256,9 @@ def _words_to_tokens( :param tokenizer: Tokenizer (e.g. 
from get_tokenizer)) :return: Tuple of (tokens, offsets, start_of_word) """ - tokens = [] - token_offsets = [] - start_of_word = [] + tokens: List[str] = [] + token_offsets: List[int] = [] + start_of_word: List[bool] = [] index = 0 for index, word, word_offset in enumerate(zip(words, word_offsets)): if index % 500000 == 0: diff --git a/test/samples/squad/tiny_augmented.json b/test/samples/squad/tiny_augmented.json index 2c29add194..c906c383e8 100644 --- a/test/samples/squad/tiny_augmented.json +++ b/test/samples/squad/tiny_augmented.json @@ -1 +1 @@ -{"data": [{"title": "test1", "paragraphs": [{"context": "my name is carla \u2014 me danced together with abdul - berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "my grandmother is baba and i met together with you ka jakarta", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "my sister is carla & i live upstairs with friends boom berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "the name is harry and i worked together with friends in berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "whose aunt is carla and i sang together paula abdul in berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}]}, {"title": "test2", "paragraphs": [{"context": "suppose is another test context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "what is another test context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "where is the test for", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "suppose defines for test context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "these constitutes a social that", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}]}], "topics": [{"title": "test1", "paragraphs": [{"context": "my name is carla \u2014 me danced together with abdul - berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "my grandmother is baba and i met together with you ka jakarta", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "my sister is carla & i live upstairs with friends boom berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "the name is harry and i worked together with friends in berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "whose aunt is carla and i sang together paula abdul in berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}]}, {"title": "test2", "paragraphs": [{"context": "suppose is another test context", "qas": [{"answers": [], "id": 
-5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "what is another test context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "where is the test for", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "suppose defines for test context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "these constitutes a social that", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}]}]} \ No newline at end of file +{"data": [{"title": "test1", "paragraphs": [{"context": "maiden father is carla and i lives together with friends in berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "my dad is carla and i lived comfortably at abdul rahman manhattan", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "my mum ... carla and maria perform exclusively with myself karim berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "last wife , carla because i live now beside abdul in berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "my name is carla and i live together with abdul hamid berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}]}, {"title": "test2", "paragraphs": [{"context": "this is another test context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "thus is another test .", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "this is another mathematical context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "this is another test context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "there is dynamic test context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}]}], "topics": [{"title": "test1", "paragraphs": [{"context": "maiden father is carla and i lives together with friends in berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "my dad is carla and i lived comfortably at abdul rahman manhattan", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "my mum ... 
carla and maria perform exclusively with myself karim berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "last wife , carla because i live now beside abdul in berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}, {"context": "my name is carla and i live together with abdul hamid berlin", "qas": [{"answers": [], "id": 7211011040021040393, "question": "Who lives in Berlin?", "is_impossible": false}]}]}, {"title": "test2", "paragraphs": [{"context": "this is another test context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "thus is another test .", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "this is another mathematical context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "this is another test context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}, {"context": "there is dynamic test context", "qas": [{"answers": [], "id": -5782547119306399562, "question": "The model can't answer this", "is_impossible": false}]}]}]} \ No newline at end of file From 0253b14aa45f46d6fcb89cbe53a8c0b004ba352b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Jun 2022 14:14:32 +0000 Subject: [PATCH 12/89] Update Documentation & Code Style --- haystack/modeling/model/biadaptive_model.py | 4 +++- haystack/modeling/model/language_model.py | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 7b5beadc61..b8a8407778 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -344,7 +344,9 @@ def forward_lm( if passage_input_ids is not None: pooled_output2, _ = self.language_model2( - input_ids=passage_input_ids[0], segment_ids=passage_segment_ids[0], attention_mask=passage_attention_mask[0] + input_ids=passage_input_ids[0], + segment_ids=passage_segment_ids[0], + attention_mask=passage_attention_mask[0], ) pooled_output[1] = pooled_output2 diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index bf0712c4c9..0a01869abc 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -481,9 +481,7 @@ def __init__( ) # Instantiate the class for this model - self.model.base_model.bert_model = language_model_class( - pretrained_model_name_or_path, **kwargs - ).model + self.model.base_model.bert_model = language_model_class(pretrained_model_name_or_path, **kwargs).model self.language = self.model.config.language else: From d78d55abb6d88b5e2747e272d09870216a19f2d5 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 16:24:20 +0200 Subject: [PATCH 13/89] typing tokenization better --- haystack/modeling/model/tokenization.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index d05ae8dbdf..cd9b72f7e4 100644 --- a/haystack/modeling/model/tokenization.py +++ 
b/haystack/modeling/model/tokenization.py @@ -144,7 +144,7 @@ def _get_start_of_word_QA(word_ids): return [1] + list(np.ediff1d(np.array(word_ids))) -def tokenize_with_metadata(text: str, tokenizer) -> Dict[str, Any]: +def tokenize_with_metadata(text: str, tokenizer: PreTrainedTokenizer) -> Dict[str, Any]: """ Performing tokenization while storing some important metadata for each token: @@ -175,18 +175,18 @@ def tokenize_with_metadata(text: str, tokenizer) -> Dict[str, Any]: tokens = tokenized["input_ids"] offsets = np.array([x[0] for x in tokenized["offset_mapping"]]) # offsets2 = [x[0] for x in tokenized2["offset_mapping"]] - words = np.array(tokenized.encodings[0].words) + words_array = np.array(tokenized.encodings[0].words) # TODO check for validity for all tokenizer and special token types - words[0] = -1 - words[-1] = words[-2] - words += 1 - start_of_word = [0] + list(np.ediff1d(words)) + words_array[0] = -1 + words_array[-1] = words_array[-2] + words_array += 1 + start_of_word = [0] + list(np.ediff1d(words_array)) tokenized_dict = {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} else: # split text into "words" (here: simple whitespace tokenizer). words = text.split(" ") - word_offsets = [] + word_offsets: List[int] = [] cumulated = 0 for word in words: word_offsets.append(cumulated) @@ -260,7 +260,7 @@ def _words_to_tokens( token_offsets: List[int] = [] start_of_word: List[bool] = [] index = 0 - for index, word, word_offset in enumerate(zip(words, word_offsets)): + for index, (word, word_offset) in enumerate(zip(words, word_offsets)): if index % 500000 == 0: logger.info(index) # Get (subword) tokens of single word. From 75518505bfe821894e97a412bf06594c768b9e20 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 18:08:51 +0200 Subject: [PATCH 14/89] more fixes for mypy --- haystack/modeling/data_handler/processor.py | 10 +++--- haystack/modeling/model/language_model.py | 26 +++++++++----- haystack/modeling/model/tokenization.py | 39 ++++++++++----------- 3 files changed, 42 insertions(+), 33 deletions(-) diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index 92b28212e7..cdc383827b 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -308,7 +308,7 @@ def file_to_dicts(self, file: str) -> List[dict]: raise NotImplementedError() @abstractmethod - def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False): + def dataset_from_dicts(self, dicts: List[dict], indices: List[int] = [], return_baskets: bool = False): raise NotImplementedError() @abstractmethod @@ -970,7 +970,7 @@ def save(self, save_dir: Union[str, Path]): with open(output_config_file, "w") as file: json.dump(config, file) - def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False): + def dataset_from_dicts(self, dicts: List[dict], indices: List[int] = [], return_baskets: bool = False): """ Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR). For conversion we have an internal representation called "baskets". 
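A standalone sketch of the enumerate/zip unpacking corrected in the tokenization.py hunk of PATCH 13 above (plain Python, no Haystack imports; the values are made up for illustration):

    words = ["Who", "lives", "in", "Berlin"]
    word_offsets = [0, 4, 10, 13]

    # enumerate(zip(...)) yields (index, (word, offset)) pairs, so the inner
    # pair needs its own parentheses when unpacking:
    for index, (word, word_offset) in enumerate(zip(words, word_offsets)):
        print(index, word, word_offset)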
@@ -1480,7 +1480,7 @@ def _read_multimodal_dpr_json(self, file: str, max_samples: Optional[int] = None standard_dicts.append(sample) return standard_dicts - def dataset_from_dicts(self, dicts: List[Dict], indices: Optional[List[int]] = None, return_baskets: bool = False): + def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False): """ Convert input dictionaries into a pytorch dataset for TextSimilarity. For conversion we have an internal representation called "baskets". @@ -1971,7 +1971,7 @@ def convert_labels(self, dictionary: Dict): ret: Dict = {} return ret - def dataset_from_dicts(self, dicts: List[Dict], indices=None, return_baskets: bool = False, debug: bool = False): + def dataset_from_dicts(self, dicts: List[Dict], indices: List[str] = [], return_baskets: bool = False, debug: bool = False): """ Function to convert input dictionaries containing text into a torch dataset. For normal operation with Language Models it calls the superclass' TextClassification.dataset_from_dicts method. @@ -2059,7 +2059,7 @@ def file_to_dicts(self, file: str) -> List[dict]: dicts.append({"text": line}) return dicts - def dataset_from_dicts(self, dicts: List[dict], indices: Optional[List[int]] = None, return_baskets: bool = False): + def dataset_from_dicts(self, dicts: List[dict], indices: List[int] = [], return_baskets: bool = False): if return_baskets: raise NotImplementedError("return_baskets is not supported by UnlabeledTextProcessor") texts = [dict_["text"] for dict_ in dicts] diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 0a01869abc..ca10efdc65 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -18,6 +18,7 @@ Thanks for the great work! """ +from ast import Str from typing import Type, Optional, Dict, Any, Union, List import re @@ -103,7 +104,7 @@ def forward( self, input_ids: torch.Tensor, segment_ids: torch.Tensor, - padding_mask: torch.Tensor = None, + attention_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, ): @@ -305,7 +306,7 @@ def __init__( self.model = model_class.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **(model_kwargs or {}) ) - self.language = language or _guess_language(pretrained_model_name_or_path) + self.language = language or _guess_language(str(pretrained_model_name_or_path)) # resize embeddings in case of custom vocab if n_added_tokens != 0: @@ -378,7 +379,6 @@ def __init__( :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. """ super().__init__(pretrained_model_name_or_path, language, n_added_tokens, model_kwargs) - self.pooler = None config = self.model.config # These models do not provide a pooled_output by default. Therefore, we need to initialize an extra pooler. 
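A rough sketch of the kind of sequence-summary pooler these wrappers add, using transformers' SequenceSummary (assumptions: the config attribute values mirror the DistilBert entry of PARAMETERS_BY_MODEL shown earlier in this series, and the tensor shapes are illustrative):

    import torch
    from transformers import DistilBertConfig
    from transformers.modeling_utils import SequenceSummary

    # DistilBERT-style settings: summarize via the first token, apply tanh, no dropout.
    config = DistilBertConfig()
    config.summary_last_dropout = 0
    config.summary_type = "first"
    config.summary_activation = "tanh"

    pooler = SequenceSummary(config)
    hidden_states = torch.rand(2, 10, config.dim)  # (batch, seq_len, hidden)
    pooled = pooler(hidden_states)                 # (batch, hidden)
    print(pooled.shape)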
@@ -449,7 +449,10 @@ def __init__( kwargs = model_kwargs or {} model_classname = f"DPR{self.role.capitalize()}Encoder" - model_class: Optional[Type[PreTrainedModel]] = getattr(transformers, model_classname, None) + try: + model_class: Type[PreTrainedModel] = getattr(transformers, model_classname) + except AttributeError as e: + raise ModelingError(f"Model class of type '{model_classname}' not found.") # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -481,7 +484,7 @@ def __init__( ) # Instantiate the class for this model - self.model.base_model.bert_model = language_model_class(pretrained_model_name_or_path, **kwargs).model + self.model.base_model.bert_model = language_model_class(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs).model self.language = self.model.config.language else: @@ -508,7 +511,7 @@ def __init__( self.model.base_model.bert_model = AutoModel.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **original_config_dict ) - self.language = language or _guess_language(pretrained_model_name_or_path) + self.language = language or _guess_language(Str(pretrained_model_name_or_path)) @property def encoder(self): @@ -561,7 +564,14 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] super().save(save_dir=save_dir, state_dict=state_dict) - def forward(self, input_ids: torch.Tensor, segment_ids: torch.Tensor, attention_mask: torch.Tensor): # type: ignore + def forward( + self, + input_ids: torch.Tensor, + segment_ids: torch.Tensor, + attention_mask: torch.Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + ): """ Perform the forward pass of the DPR encoder model. @@ -582,7 +592,7 @@ def forward(self, input_ids: torch.Tensor, segment_ids: torch.Tensor, attention_ output_tuple = self.model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True ) - if self.encoder.config.output_hidden_states == True: + if output_hidden_states or self.encoder.config.output_hidden_states: pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states return pooled_output, all_hidden_states else: diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index cd9b72f7e4..cb9f864242 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -64,7 +64,7 @@ def get_tokenizer( def tokenize_batch_question_answering( - pre_baskets: Dict[Any, Any], tokenizer: PreTrainedTokenizer, indices: List[Any] + pre_baskets: List[Dict[str, Any]], tokenizer: PreTrainedTokenizer, indices: List[Any] ) -> List[SampleBasket]: """ Tokenizes text data for question answering tasks. 
Tokenization means splitting words into subwords, depending on the @@ -175,27 +175,26 @@ def tokenize_with_metadata(text: str, tokenizer: PreTrainedTokenizer) -> Dict[st tokens = tokenized["input_ids"] offsets = np.array([x[0] for x in tokenized["offset_mapping"]]) # offsets2 = [x[0] for x in tokenized2["offset_mapping"]] - words_array = np.array(tokenized.encodings[0].words) + words = np.array(tokenized.encodings[0].words) # TODO check for validity for all tokenizer and special token types - words_array[0] = -1 - words_array[-1] = words_array[-2] - words_array += 1 - start_of_word = [0] + list(np.ediff1d(words_array)) - tokenized_dict = {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} - else: - # split text into "words" (here: simple whitespace tokenizer). - words = text.split(" ") - word_offsets: List[int] = [] - cumulated = 0 - for word in words: - word_offsets.append(cumulated) - cumulated += len(word) + 1 # 1 because we so far have whitespace tokenizer - - # split "words" into "subword tokens" - tokens, offsets, start_of_word = _words_to_tokens(words, word_offsets, tokenizer) - tokenized_dict = {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} - return tokenized_dict + words[0] = -1 + words[-1] = words[-2] + words += 1 + start_of_word: List[int] = [0] + list(np.ediff1d(words)) + return {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} + + # split text into "words" (here: simple whitespace tokenizer). + words: List[str] = text.split(" ") + word_offsets: List[int] = [] + cumulated = 0 + for word in words: + word_offsets.append(cumulated) + cumulated += len(word) + 1 # 1 because we so far have whitespace tokenizer + + # split "words" into "subword tokens" + tokens, offsets, start_of_word = _words_to_tokens(words, word_offsets, tokenizer) + return {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} def truncate_sequences( From e78fe2ec32f15a409b0f1f48a2163955913e0c05 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Jun 2022 16:11:41 +0000 Subject: [PATCH 15/89] Update Documentation & Code Style --- haystack/modeling/data_handler/processor.py | 4 +++- haystack/modeling/model/language_model.py | 10 ++++++---- haystack/modeling/model/tokenization.py | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index cdc383827b..2371af335f 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -1971,7 +1971,9 @@ def convert_labels(self, dictionary: Dict): ret: Dict = {} return ret - def dataset_from_dicts(self, dicts: List[Dict], indices: List[str] = [], return_baskets: bool = False, debug: bool = False): + def dataset_from_dicts( + self, dicts: List[Dict], indices: List[str] = [], return_baskets: bool = False, debug: bool = False + ): """ Function to convert input dictionaries containing text into a torch dataset. For normal operation with Language Models it calls the superclass' TextClassification.dataset_from_dicts method. 
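A minimal usage sketch of the get_language_model factory these patches keep refactoring (not part of the diff; the call shape mirrors the test_dpr.py hunks earlier in the series, the checkpoint name is an arbitrary example, and network access is assumed):

    from haystack.modeling.model.language_model import get_language_model

    # AutoConfig resolves the checkpoint's model_type ("bert"), which is then
    # mapped to a Haystack wrapper class via HUGGINGFACE_TO_HAYSTACK.
    language_model = get_language_model(pretrained_model_name_or_path="bert-base-uncased")
    print(type(language_model).__name__, language_model.name)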
diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index ca10efdc65..c18e23b8bd 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -484,7 +484,9 @@ def __init__( ) # Instantiate the class for this model - self.model.base_model.bert_model = language_model_class(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs).model + self.model.base_model.bert_model = language_model_class( + pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs + ).model self.language = self.model.config.language else: @@ -565,9 +567,9 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] super().save(save_dir=save_dir, state_dict=state_dict) def forward( - self, - input_ids: torch.Tensor, - segment_ids: torch.Tensor, + self, + input_ids: torch.Tensor, + segment_ids: torch.Tensor, attention_mask: torch.Tensor, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index cb9f864242..e2437242da 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -183,7 +183,7 @@ def tokenize_with_metadata(text: str, tokenizer: PreTrainedTokenizer) -> Dict[st words += 1 start_of_word: List[int] = [0] + list(np.ediff1d(words)) return {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} - + # split text into "words" (here: simple whitespace tokenizer). words: List[str] = text.split(" ") word_offsets: List[int] = [] From 8ed07fffae3ab29305821876b83f46bf5aa60189 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 18:15:04 +0200 Subject: [PATCH 16/89] pylint --- haystack/modeling/model/language_model.py | 1 - haystack/nodes/retriever/dense.py | 1 - 2 files changed, 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index ca10efdc65..157d8eb142 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -629,7 +629,6 @@ def forward( "dpr.*question.*encoder": "DPRQuestionEncoder", "dpr.*context.*encoder": "DPRContextEncoder", "dpr.*ctx.*encoder": "DPRContextEncoder", - "mlm.*codebert": "Roberta", "deberta-v2": "DebertaV2", } PARAMETERS_BY_MODEL: Dict[str, Dict[str, Any]] = { diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 66925af055..4bf7305609 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -25,7 +25,6 @@ from haystack.document_stores import BaseDocumentStore from haystack.nodes.retriever.base import BaseRetriever from haystack.nodes.retriever._embedding_encoder import _EMBEDDING_ENCODERS -from haystack.modeling.model.tokenization import get_tokenizer from haystack.modeling.model.language_model import get_language_model from haystack.modeling.model.biadaptive_model import BiAdaptiveModel from haystack.modeling.model.triadaptive_model import TriAdaptiveModel From 4226eea7a564ba1ad78afca5a670722c3036babc Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 18:38:23 +0200 Subject: [PATCH 17/89] more mypy --- haystack/modeling/data_handler/processor.py | 14 ++--- haystack/modeling/model/language_model.py | 67 ++++++++++++++------- haystack/modeling/model/tokenization.py | 9 ++- haystack/nodes/retriever/dense.py | 10 +-- 4 files changed, 64 insertions(+), 36 deletions(-) diff --git 
a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index cdc383827b..1b08d4d467 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -308,7 +308,7 @@ def file_to_dicts(self, file: str) -> List[dict]: raise NotImplementedError() @abstractmethod - def dataset_from_dicts(self, dicts: List[dict], indices: List[int] = [], return_baskets: bool = False): + def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): raise NotImplementedError() @abstractmethod @@ -438,7 +438,7 @@ def __init__( "using the default task or add a custom task later via processor.add_task()" ) - def dataset_from_dicts(self, dicts: List[dict], indices: List[int] = [], return_baskets: bool = False): + def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): """ Convert input dictionaries into a pytorch dataset for Question Answering. For this we have an internal representation called "baskets". @@ -970,7 +970,7 @@ def save(self, save_dir: Union[str, Path]): with open(output_config_file, "w") as file: json.dump(config, file) - def dataset_from_dicts(self, dicts: List[dict], indices: List[int] = [], return_baskets: bool = False): + def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): """ Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR). For conversion we have an internal representation called "baskets". @@ -1480,7 +1480,7 @@ def _read_multimodal_dpr_json(self, file: str, max_samples: Optional[int] = None standard_dicts.append(sample) return standard_dicts - def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False): + def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): """ Convert input dictionaries into a pytorch dataset for TextSimilarity. For conversion we have an internal representation called "baskets". @@ -1828,7 +1828,7 @@ def __init__( def file_to_dicts(self, file: str) -> List[Dict]: raise NotImplementedError - def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, debug=False): + def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): self.baskets = [] # Tokenize in batches texts = [x["text"] for x in dicts] @@ -1971,7 +1971,7 @@ def convert_labels(self, dictionary: Dict): ret: Dict = {} return ret - def dataset_from_dicts(self, dicts: List[Dict], indices: List[str] = [], return_baskets: bool = False, debug: bool = False): + def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): """ Function to convert input dictionaries containing text into a torch dataset. For normal operation with Language Models it calls the superclass' TextClassification.dataset_from_dicts method. 
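Most of the churn in this file is about making every dataset_from_dicts override line up with the abstract method on the base processor, since mypy enforces override compatibility. A minimal sketch of the constraint being satisfied (class names are placeholders, not taken from the codebase):

    from abc import ABC, abstractmethod
    from typing import Dict, List

    class Processor(ABC):
        @abstractmethod
        def dataset_from_dicts(
            self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
        ):
            raise NotImplementedError()

    class MyProcessor(Processor):
        # mypy flags an override whose signature diverges from the base method,
        # e.g. typing indices as List[str] or dropping the debug parameter,
        # which is what the repeated signature edits above are aligning.
        def dataset_from_dicts(
            self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False
        ):
            return dicts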
@@ -2059,7 +2059,7 @@ def file_to_dicts(self, file: str) -> List[dict]: dicts.append({"text": line}) return dicts - def dataset_from_dicts(self, dicts: List[dict], indices: List[int] = [], return_baskets: bool = False): + def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): if return_baskets: raise NotImplementedError("return_baskets is not supported by UnlabeledTextProcessor") texts = [dict_["text"] for dict_ in dicts] diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 157d8eb142..e16c0c0b8f 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -90,10 +90,18 @@ class LanguageModel(nn.Module, ABC): These models read in tokenized sentences and return vectors that capture the meaning of sentences or of tokens. """ - def __init__(self, name: str): + def __init__( + self, + pretrained_model_name_or_path: Union[Path, str], + model_type: str, + language: str = None, + n_added_tokens: int = 0, + use_auth_token: Optional[Union[str, bool]] = None, + model_kwargs: Optional[Dict[str, Any]] = None, + ): super().__init__() self._output_dims = None - self.name = name + self.name = model_type @property def encoder(self): @@ -286,7 +294,7 @@ def __init__( :param language: the model's language ('multilingual' is also accepted) :param use_auth_token: the HF token or False """ - super().__init__(name=model_type) + super().__init__(model_type=model_type) config_class: PretrainedConfig = getattr(transformers, model_type + "Config", None) model_class: PreTrainedModel = getattr(transformers, model_type + "Model", None) @@ -365,8 +373,10 @@ class HFLanguageModelWithPooler(HFLanguageModel): def __init__( self, pretrained_model_name_or_path: Union[Path, str], + model_type: str, language: str = None, n_added_tokens: int = 0, + use_auth_token: Optional[Union[str, bool]] = None, model_kwargs: Optional[Dict[str, Any]] = None, ): """ @@ -378,7 +388,14 @@ def __init__( :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. """ - super().__init__(pretrained_model_name_or_path, language, n_added_tokens, model_kwargs) + super().__init__( + pretrained_model_name_or_path=pretrained_model_name_or_path, + model_type=model_type, + language=language, + n_added_tokens=n_added_tokens, + use_auth_token=use_auth_token, + model_kwargs=model_kwargs + ) config = self.model.config # These models do not provide a pooled_output by default. Therefore, we need to initialize an extra pooler. @@ -443,7 +460,7 @@ def __init__( * A local path of a model trained using Haystack (for example, "some_dir/haystack_model"). :param pretrained_model_name_or_path: The path of the base pretrained language model whose weights are used to initialize DPRQuestionEncoder. 
""" - super().__init__(name=model_type) + super().__init__(model_type=model_type) self.role = "question" if "question" in model_type.lower() else "context" self._encoder = None @@ -476,15 +493,15 @@ def __init__( language_model_type = _get_model_type(haystack_lm_config, use_auth_token=use_auth_token, **kwargs) # Find the class corresponding to this model type - language_model_class: Type[LanguageModel] = HUGGINGFACE_TO_HAYSTACK.get(language_model_type, None) - if not language_model_class: + try: + language_model_class: Type[LanguageModel] = HUGGINGFACE_TO_HAYSTACK[language_model_type] + except KeyError as e: raise ValueError( f"The type of model supplied ({language_model_type}) is not supported by Haystack. " f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" ) - # Instantiate the class for this model - self.model.base_model.bert_model = language_model_class(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs).model + self.model.base_model.bert_model = language_model_class(name="bert", **kwargs).model self.language = self.model.config.language else: @@ -511,7 +528,7 @@ def __init__( self.model.base_model.bert_model = AutoModel.from_pretrained( str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **original_config_dict ) - self.language = language or _guess_language(Str(pretrained_model_name_or_path)) + self.language = language or _guess_language(str(pretrained_model_name_or_path)) @property def encoder(self): @@ -651,7 +668,9 @@ def forward( def get_language_model( pretrained_model_name_or_path: Union[Path, str], - language_model_type: Optional[str] = None, + model_type: str, + language: str = None, + n_added_tokens: int = 0, use_auth_token: Optional[Union[str, bool]] = None, revision: Optional[str] = None, autoconfig_kwargs: Optional[Dict[str, Any]] = None, @@ -676,25 +695,25 @@ def get_language_model( config_file = Path(pretrained_model_name_or_path) / "language_model_config.json" - if language_model_type is None: + if model_type is None: if os.path.exists(config_file): # it's a local directory in Haystack format logger.info(f"Model found locally at {pretrained_model_name_or_path}") config = json.load(open(config_file)) - language_model_type = config["name"] + model_type = config["name"] else: # It's from the model hub logger.info(f"Could not find '{pretrained_model_name_or_path}' locally.") logger.info(f"Looking on Transformers Model Hub (in local cache and online)...") - language_model_type = _get_model_type( + model_type = _get_model_type( pretrained_model_name_or_path, use_auth_token=use_auth_token, revision=revision, autoconfig_kwargs=autoconfig_kwargs, ) - if not language_model_type: + if not model_type: raise ModelingError( f"Model not found for '{pretrained_model_name_or_path}'. Either supply the local path for a saved " f"model or one of bert/roberta/xlnet/albert/distilbert models that can be downloaded from remote. " @@ -703,21 +722,24 @@ def get_language_model( ) # Find the class corresponding to this model type - language_model_class: Type[LanguageModel] = HUGGINGFACE_TO_HAYSTACK.get(language_model_type, None) - if not language_model_class: + try: + language_model_class: Type[LanguageModel] = HUGGINGFACE_TO_HAYSTACK[model_type] + except KeyError as e: raise ValueError( - f"The type of model supplied ({language_model_type}) is not supported by Haystack. " + f"The type of model supplied ({model_type}) is not supported by Haystack. 
" f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" - ) + ) from e # Instantiate the class for this model language_model = language_model_class( - pretrained_model_name_or_path, - model_type=language_model_type, + name=pretrained_model_name_or_path, + model_type=model_type, + language=language, + n_added_tokens=n_added_tokens, use_auth_token=use_auth_token, model_kwargs=model_kwargs, ) - logger.info(f"Loaded '{pretrained_model_name_or_path}' ({language_model_type} model)") + logger.info(f"Loaded '{pretrained_model_name_or_path}' ({model_type} model)") return language_model @@ -738,6 +760,7 @@ def _get_model_type( try: config = AutoConfig.from_pretrained( pretrained_model_name_or_path=model_name_or_path, + model_type=model_type, use_auth_token=use_auth_token, revision=revision, **(autoconfig_kwargs or {}), diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index cb9f864242..c3bbe5db4d 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -167,6 +167,11 @@ def tokenize_with_metadata(text: str, tokenizer: PreTrainedTokenizer) -> Dict[st # Note: using text.split() directly would destroy the offset, # since \n\n\n would be treated similarly as a single \n text = re.sub(r"\s", " ", text) + + words: Union[List[str], np.ndarray] = [] + word_offsets: Union[List[int], np.ndarray] = [] + start_of_word: Union[List[int], List[bool]] = [] + # Fast Tokenizers return offsets, so we don't need to calculate them ourselves if tokenizer.is_fast: # tokenized = tokenizer(text, return_offsets_mapping=True, return_special_tokens_mask=True) @@ -181,11 +186,11 @@ def tokenize_with_metadata(text: str, tokenizer: PreTrainedTokenizer) -> Dict[st words[0] = -1 words[-1] = words[-2] words += 1 - start_of_word: List[int] = [0] + list(np.ediff1d(words)) + start_of_word = [0] + list(np.ediff1d(words)) return {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} # split text into "words" (here: simple whitespace tokenizer). 
- words: List[str] = text.split(" ") + words = text.split(" ") word_offsets: List[int] = [] cumulated = 0 for word in words: diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 4bf7305609..3d908e456b 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -168,7 +168,7 @@ def __init__( self.query_encoder = get_language_model( pretrained_model_name_or_path=query_embedding_model, revision=model_version, - language_model_type="DPRQuestionEncoder", + model_type="DPRQuestionEncoder", use_auth_token=use_auth_token, ) self.passage_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained( @@ -181,7 +181,7 @@ def __init__( self.passage_encoder = get_language_model( pretrained_model_name_or_path=passage_embedding_model, revision=model_version, - language_model_type="DPRContextEncoder", + model_type="DPRContextEncoder", use_auth_token=use_auth_token, ) @@ -881,7 +881,7 @@ def __init__( ) self.query_encoder = get_language_model( pretrained_model_name_or_path=query_embedding_model, - language_model_type="DPRQuestionEncoder", + model_type="DPRQuestionEncoder", revision=model_version, use_auth_token=use_auth_token, ) @@ -894,7 +894,7 @@ def __init__( ) self.passage_encoder = get_language_model( pretrained_model_name_or_path=passage_embedding_model, - language_model_type="DPRContextEncoder", + model_type="DPRContextEncoder", revision=model_version, use_auth_token=use_auth_token, ) @@ -907,7 +907,7 @@ def __init__( ) self.table_encoder = get_language_model( pretrained_model_name_or_path=table_embedding_model, - language_model_type="DPRContextEncoder", + model_type="DPRContextEncoder", revision=model_version, use_auth_token=use_auth_token, ) From e4e9ba122df19eb016686ec73efa424a74c4f7da Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 18:41:50 +0200 Subject: [PATCH 18/89] remove merge tags --- haystack/modeling/data_handler/processor.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index cf9ab4b3aa..1b08d4d467 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -1971,13 +1971,7 @@ def convert_labels(self, dictionary: Dict): ret: Dict = {} return ret -<<<<<<< HEAD def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): -======= - def dataset_from_dicts( - self, dicts: List[Dict], indices: List[str] = [], return_baskets: bool = False, debug: bool = False - ): ->>>>>>> e78fe2ec32f15a409b0f1f48a2163955913e0c05 """ Function to convert input dictionaries containing text into a torch dataset. For normal operation with Language Models it calls the superclass' TextClassification.dataset_from_dicts method. 
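The dense.py hunks in the "more mypy" patch above switch the retrievers to the renamed model_type argument of get_language_model. A rough sketch of that call pattern for the two DPR encoders, using the public checkpoint names only as examples (revision and use_auth_token are omitted for brevity, and whether this runs as-is depends on the rest of the refactor in this series landing):

    from haystack.modeling.model.language_model import get_language_model

    # model_type selects the DPREncoder wrapper for question vs. context encoding
    query_encoder = get_language_model(
        pretrained_model_name_or_path="facebook/dpr-question_encoder-single-nq-base",
        model_type="DPRQuestionEncoder",
    )
    passage_encoder = get_language_model(
        pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base",
        model_type="DPRContextEncoder",
    )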
From 7fc2443d9f0e29ce2073156238b1b82c847bd2a2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Jun 2022 16:44:21 +0000 Subject: [PATCH 19/89] Update Documentation & Code Style --- haystack/modeling/data_handler/processor.py | 28 +++++++++++++++------ haystack/modeling/model/language_model.py | 8 +++--- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/haystack/modeling/data_handler/processor.py b/haystack/modeling/data_handler/processor.py index 1b08d4d467..b66a435b0a 100644 --- a/haystack/modeling/data_handler/processor.py +++ b/haystack/modeling/data_handler/processor.py @@ -308,7 +308,9 @@ def file_to_dicts(self, file: str) -> List[dict]: raise NotImplementedError() @abstractmethod - def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): + def dataset_from_dicts( + self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + ): raise NotImplementedError() @abstractmethod @@ -438,7 +440,9 @@ def __init__( "using the default task or add a custom task later via processor.add_task()" ) - def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): + def dataset_from_dicts( + self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + ): """ Convert input dictionaries into a pytorch dataset for Question Answering. For this we have an internal representation called "baskets". @@ -970,7 +974,9 @@ def save(self, save_dir: Union[str, Path]): with open(output_config_file, "w") as file: json.dump(config, file) - def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): + def dataset_from_dicts( + self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + ): """ Convert input dictionaries into a pytorch dataset for TextSimilarity (e.g. DPR). For conversion we have an internal representation called "baskets". @@ -1480,7 +1486,9 @@ def _read_multimodal_dpr_json(self, file: str, max_samples: Optional[int] = None standard_dicts.append(sample) return standard_dicts - def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): + def dataset_from_dicts( + self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + ): """ Convert input dictionaries into a pytorch dataset for TextSimilarity. For conversion we have an internal representation called "baskets". 
@@ -1828,7 +1836,9 @@ def __init__( def file_to_dicts(self, file: str) -> List[Dict]: raise NotImplementedError - def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): + def dataset_from_dicts( + self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + ): self.baskets = [] # Tokenize in batches texts = [x["text"] for x in dicts] @@ -1971,7 +1981,9 @@ def convert_labels(self, dictionary: Dict): ret: Dict = {} return ret - def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): + def dataset_from_dicts( + self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + ): """ Function to convert input dictionaries containing text into a torch dataset. For normal operation with Language Models it calls the superclass' TextClassification.dataset_from_dicts method. @@ -2059,7 +2071,9 @@ def file_to_dicts(self, file: str) -> List[dict]: dicts.append({"text": line}) return dicts - def dataset_from_dicts(self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False): + def dataset_from_dicts( + self, dicts: List[Dict], indices: List[int] = [], return_baskets: bool = False, debug: bool = False + ): if return_baskets: raise NotImplementedError("return_baskets is not supported by UnlabeledTextProcessor") texts = [dict_["text"] for dict_ in dicts] diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index c368a2410d..973b625c1c 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -389,12 +389,12 @@ def __init__( :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. """ super().__init__( - pretrained_model_name_or_path=pretrained_model_name_or_path, + pretrained_model_name_or_path=pretrained_model_name_or_path, model_type=model_type, - language=language, - n_added_tokens=n_added_tokens, + language=language, + n_added_tokens=n_added_tokens, use_auth_token=use_auth_token, - model_kwargs=model_kwargs + model_kwargs=model_kwargs, ) config = self.model.config From 94d5d0df2bc75e94e9d418f6bdef98c0a75ba157 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 18:49:14 +0200 Subject: [PATCH 20/89] mypy --- haystack/modeling/data_handler/samples.py | 4 ++-- haystack/modeling/model/language_model.py | 12 ++---------- haystack/modeling/model/tokenization.py | 3 +-- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/haystack/modeling/data_handler/samples.py b/haystack/modeling/data_handler/samples.py index 443295ea64..51f497e830 100644 --- a/haystack/modeling/data_handler/samples.py +++ b/haystack/modeling/data_handler/samples.py @@ -1,4 +1,4 @@ -from typing import Union, Optional, List +from typing import Any, Union, Optional, List, Dict import logging import numpy as np @@ -13,7 +13,7 @@ class Sample: the human readable clear_text. 
Over the course of data preprocessing, this object is populated with tokenized and featurized versions of the data.""" - def __init__(self, id: str, clear_text: dict, tokenized: Optional[dict] = None, features: Optional[dict] = None): + def __init__(self, id: str, clear_text: dict, tokenized: Optional[dict] = None, features: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None): """ :param id: The unique id of the sample :param clear_text: A dictionary containing various human readable fields (e.g. text, label). diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index c368a2410d..16440b7cc4 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -90,15 +90,7 @@ class LanguageModel(nn.Module, ABC): These models read in tokenized sentences and return vectors that capture the meaning of sentences or of tokens. """ - def __init__( - self, - pretrained_model_name_or_path: Union[Path, str], - model_type: str, - language: str = None, - n_added_tokens: int = 0, - use_auth_token: Optional[Union[str, bool]] = None, - model_kwargs: Optional[Dict[str, Any]] = None, - ): + def __init__(self, model_type: str): super().__init__() self._output_dims = None self.name = model_type @@ -670,7 +662,7 @@ def forward( def get_language_model( pretrained_model_name_or_path: Union[Path, str], - model_type: str, + model_type: Optional[str] = None, language: str = None, n_added_tokens: int = 0, use_auth_token: Optional[Union[str, bool]] = None, diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index fec08a0bc7..3af175f8e0 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -170,7 +170,7 @@ def tokenize_with_metadata(text: str, tokenizer: PreTrainedTokenizer) -> Dict[st words: Union[List[str], np.ndarray] = [] word_offsets: Union[List[int], np.ndarray] = [] - start_of_word: Union[List[int], List[bool]] = [] + start_of_word: List[Union[int, bool]] = [] # Fast Tokenizers return offsets, so we don't need to calculate them ourselves if tokenizer.is_fast: @@ -191,7 +191,6 @@ def tokenize_with_metadata(text: str, tokenizer: PreTrainedTokenizer) -> Dict[st # split text into "words" (here: simple whitespace tokenizer). words = text.split(" ") - word_offsets: List[int] = [] cumulated = 0 for word in words: word_offsets.append(cumulated) From d3853997072cbc82e29a43a30631bbc23eb0cf26 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Jun 2022 16:52:03 +0000 Subject: [PATCH 21/89] Update Documentation & Code Style --- haystack/modeling/data_handler/samples.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/haystack/modeling/data_handler/samples.py b/haystack/modeling/data_handler/samples.py index 51f497e830..6335490ec7 100644 --- a/haystack/modeling/data_handler/samples.py +++ b/haystack/modeling/data_handler/samples.py @@ -13,7 +13,13 @@ class Sample: the human readable clear_text. 
Over the course of data preprocessing, this object is populated with tokenized and featurized versions of the data.""" - def __init__(self, id: str, clear_text: dict, tokenized: Optional[dict] = None, features: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None): + def __init__( + self, + id: str, + clear_text: dict, + tokenized: Optional[dict] = None, + features: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, + ): """ :param id: The unique id of the sample :param clear_text: A dictionary containing various human readable fields (e.g. text, label). From 8c20ef08ebba7952212f3d3e64d2b95ace31efd7 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 19:19:05 +0200 Subject: [PATCH 22/89] mypy again --- haystack/modeling/model/biadaptive_model.py | 8 ++++---- haystack/modeling/model/language_model.py | 9 ++++----- haystack/modeling/model/tokenization.py | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index b8a8407778..77cda717af 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -336,13 +336,13 @@ def forward_lm( """ pooled_output = [None, None] - if query_input_ids is not None: + if query_input_ids is not None and query_segment_ids is not None and query_attention_mask is not None: pooled_output1, _ = self.language_model1( input_ids=query_input_ids, segment_ids=query_segment_ids, attention_mask=query_attention_mask ) pooled_output[0] = pooled_output1 - if passage_input_ids is not None: + if passage_input_ids is not None and passage_segment_ids is not None and passage_attention_mask is not None: pooled_output2, _ = self.language_model2( input_ids=passage_input_ids[0], segment_ids=passage_segment_ids[0], @@ -490,10 +490,10 @@ def convert_from_transformers( :return: AdaptiveModel """ lm1 = get_language_model( - pretrained_model_name_or_path=model_name_or_path1, language_model_class="DPRQuestionEncoder" + pretrained_model_name_or_path=model_name_or_path1, model_class="DPRQuestionEncoder" ) lm2 = get_language_model( - pretrained_model_name_or_path=model_name_or_path2, language_model_class="DPRContextEncoder" + pretrained_model_name_or_path=model_name_or_path2, model_class="DPRContextEncoder" ) prediction_head = TextSimilarityHead(similarity_function=similarity_function) # TODO Infer type of head automatically from config diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index cf3abb77e0..0af7e2c185 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -18,7 +18,6 @@ Thanks for the great work! """ -from ast import Str from typing import Type, Optional, Dict, Any, Union, List import re @@ -394,7 +393,7 @@ def __init__( # The pooler takes the first hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim). 
# We don't want a dropout in the end of the pooler, since we do that already in the adaptive model before we # feed everything to the prediction head - sequence_summary_config = PARAMETERS_BY_MODEL.get(self.name.lower()) + sequence_summary_config = PARAMETERS_BY_MODEL.get(self.name.lower(), {}) for key, value in sequence_summary_config.items(): setattr(config, key, value) @@ -486,7 +485,7 @@ def __init__( language_model_type = _get_model_type(haystack_lm_config, use_auth_token=use_auth_token, **kwargs) # Find the class corresponding to this model type try: - language_model_class: Type[LanguageModel] = HUGGINGFACE_TO_HAYSTACK[language_model_type] + language_model_class: Type[Union[HFLanguageModel, DPREncoder]] = HUGGINGFACE_TO_HAYSTACK[language_model_type] except KeyError as e: raise ValueError( f"The type of model supplied ({language_model_type}) is not supported by Haystack. " @@ -717,7 +716,7 @@ def get_language_model( # Find the class corresponding to this model type try: - language_model_class: Type[LanguageModel] = HUGGINGFACE_TO_HAYSTACK[model_type] + language_model_class: Type[Union[HFLanguageModel, DPREncoder]] = HUGGINGFACE_TO_HAYSTACK[model_type] except KeyError as e: raise ValueError( f"The type of model supplied ({model_type}) is not supported by Haystack. " @@ -726,7 +725,7 @@ def get_language_model( # Instantiate the class for this model language_model = language_model_class( - name=pretrained_model_name_or_path, + pretrained_model_name_or_path=pretrained_model_name_or_path, model_type=model_type, language=language, n_added_tokens=n_added_tokens, diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index 3af175f8e0..2b6ae16a52 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -197,7 +197,7 @@ def tokenize_with_metadata(text: str, tokenizer: PreTrainedTokenizer) -> Dict[st cumulated += len(word) + 1 # 1 because we so far have whitespace tokenizer # split "words" into "subword tokens" - tokens, offsets, start_of_word = _words_to_tokens(words, word_offsets, tokenizer) + tokens, offsets, start_of_word = _words_to_tokens(words, word_offsets, tokenizer) # type: ignore return {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} From cdb3b114b48b761e24406d024078f9c9c68511e1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Jun 2022 17:21:19 +0000 Subject: [PATCH 23/89] Update Documentation & Code Style --- haystack/modeling/model/biadaptive_model.py | 8 ++------ haystack/modeling/model/language_model.py | 4 +++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 77cda717af..861a5bf877 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -489,12 +489,8 @@ def convert_from_transformers( :type processor: Processor :return: AdaptiveModel """ - lm1 = get_language_model( - pretrained_model_name_or_path=model_name_or_path1, model_class="DPRQuestionEncoder" - ) - lm2 = get_language_model( - pretrained_model_name_or_path=model_name_or_path2, model_class="DPRContextEncoder" - ) + lm1 = get_language_model(pretrained_model_name_or_path=model_name_or_path1, model_class="DPRQuestionEncoder") + lm2 = get_language_model(pretrained_model_name_or_path=model_name_or_path2, model_class="DPRContextEncoder") prediction_head = 
TextSimilarityHead(similarity_function=similarity_function) # TODO Infer type of head automatically from config if task_type == "text_similarity": diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 0af7e2c185..4e0e502630 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -485,7 +485,9 @@ def __init__( language_model_type = _get_model_type(haystack_lm_config, use_auth_token=use_auth_token, **kwargs) # Find the class corresponding to this model type try: - language_model_class: Type[Union[HFLanguageModel, DPREncoder]] = HUGGINGFACE_TO_HAYSTACK[language_model_type] + language_model_class: Type[Union[HFLanguageModel, DPREncoder]] = HUGGINGFACE_TO_HAYSTACK[ + language_model_type + ] except KeyError as e: raise ValueError( f"The type of model supplied ({language_model_type}) is not supported by Haystack. " From 73f3a4a842a1f9cd496856e29a40394e7b171cd3 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 19:21:28 +0200 Subject: [PATCH 24/89] last mypy errors --- haystack/modeling/model/biadaptive_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 77cda717af..5224c0fd0a 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -490,10 +490,10 @@ def convert_from_transformers( :return: AdaptiveModel """ lm1 = get_language_model( - pretrained_model_name_or_path=model_name_or_path1, model_class="DPRQuestionEncoder" + pretrained_model_name_or_path=model_name_or_path1, model_type="DPRQuestionEncoder" ) lm2 = get_language_model( - pretrained_model_name_or_path=model_name_or_path2, model_class="DPRContextEncoder" + pretrained_model_name_or_path=model_name_or_path2, model_type="DPRContextEncoder" ) prediction_head = TextSimilarityHead(similarity_function=similarity_function) # TODO Infer type of head automatically from config From dc9f7531aec605350ac557aa05ea9d6272c37c41 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Jun 2022 17:24:08 +0000 Subject: [PATCH 25/89] Update Documentation & Code Style --- haystack/modeling/model/biadaptive_model.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 5224c0fd0a..ed5152892a 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -489,12 +489,8 @@ def convert_from_transformers( :type processor: Processor :return: AdaptiveModel """ - lm1 = get_language_model( - pretrained_model_name_or_path=model_name_or_path1, model_type="DPRQuestionEncoder" - ) - lm2 = get_language_model( - pretrained_model_name_or_path=model_name_or_path2, model_type="DPRContextEncoder" - ) + lm1 = get_language_model(pretrained_model_name_or_path=model_name_or_path1, model_type="DPRQuestionEncoder") + lm2 = get_language_model(pretrained_model_name_or_path=model_name_or_path2, model_type="DPRContextEncoder") prediction_head = TextSimilarityHead(similarity_function=similarity_function) # TODO Infer type of head automatically from config if task_type == "text_similarity": From 0b369152c734e08d7cd829a1fb1a11f3024da17a Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 19:39:47 +0200 Subject: [PATCH 26/89] Add n_added_tokens to DPREncoder.__init__ for 
compatibility --- haystack/modeling/model/language_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 4e0e502630..e3f1ec9dd2 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -441,6 +441,7 @@ def __init__( pretrained_model_name_or_path: Union[Path, str], model_type: str, language: str = None, + n_added_tokens: int = 0, use_auth_token: Optional[Union[str, bool]] = None, model_kwargs: Optional[Dict[str, Any]] = None, ): From fe957c1a5ca962f12736dbe528943254ff688e58 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 22 Jun 2022 20:34:29 +0200 Subject: [PATCH 27/89] fix tests --- haystack/modeling/model/language_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index e3f1ec9dd2..90ef0ef642 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -345,8 +345,8 @@ def forward( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, - output_hidden_states=output_hidden_states or self.encoder.config.output_hidden_states, - output_attentions=output_attentions or self.encoder.config.output_attentions, + output_hidden_states=output_hidden_states or self.model.encoder.config.output_hidden_states, + output_attentions=output_attentions or self.model.encoder.config.output_attentions, return_dict=False, ) @@ -496,7 +496,7 @@ def __init__( ) # Instantiate the class for this model self.model.base_model.bert_model = language_model_class( - pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs + pretrained_model_name_or_path=pretrained_model_name_or_path, model_type="bert", **kwargs ).model self.language = self.model.config.language From 129cbf6f19347a7479d61d18c5f9c507deb2cfce Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 24 Jun 2022 14:42:28 +0200 Subject: [PATCH 28/89] not all models have encoders --- haystack/modeling/model/language_model.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 90ef0ef642..216dc8efe4 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -341,6 +341,17 @@ def forward( :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. :return: Embeddings for each token in the input sequence. Can also return hidden states and attentions if specified using the arguments `output_hidden_states` and `output_attentions`. 
""" + encoder = getattr(self, "encoder", None) # Not all models have an encoder + if encoder: + output_hidden_states=output_hidden_states or self.model.encoder.config.output_hidden_states + output_attentions=output_attentions or self.model.encoder.config.output_attentions, + + params = {} + if output_hidden_states: + params["output_hidden_states"] = output_hidden_states + if output_attentions: + params["output_attentions"] = output_attentions + return self.model( input_ids=input_ids, token_type_ids=segment_ids, From 7442438e1fffc4adf6fcba713867b4999fc8455b Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 24 Jun 2022 14:43:56 +0200 Subject: [PATCH 29/89] comma --- haystack/modeling/model/language_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 216dc8efe4..219d9eeaa0 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -344,7 +344,7 @@ def forward( encoder = getattr(self, "encoder", None) # Not all models have an encoder if encoder: output_hidden_states=output_hidden_states or self.model.encoder.config.output_hidden_states - output_attentions=output_attentions or self.model.encoder.config.output_attentions, + output_attentions=output_attentions or self.model.encoder.config.output_attentions params = {} if output_hidden_states: From f4676b5b305122f6b4b26f002b7934569652bb1f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 24 Jun 2022 12:46:40 +0000 Subject: [PATCH 30/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 219d9eeaa0..d1ac64729b 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -343,8 +343,8 @@ def forward( """ encoder = getattr(self, "encoder", None) # Not all models have an encoder if encoder: - output_hidden_states=output_hidden_states or self.model.encoder.config.output_hidden_states - output_attentions=output_attentions or self.model.encoder.config.output_attentions + output_hidden_states = output_hidden_states or self.model.encoder.config.output_hidden_states + output_attentions = output_attentions or self.model.encoder.config.output_attentions params = {} if output_hidden_states: From 0c714bb127c310c2187fe99d2e216adb03527d30 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 24 Jun 2022 14:52:05 +0200 Subject: [PATCH 31/89] using params --- haystack/modeling/model/language_model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 219d9eeaa0..76bcea394b 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -356,9 +356,8 @@ def forward( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, - output_hidden_states=output_hidden_states or self.model.encoder.config.output_hidden_states, - output_attentions=output_attentions or self.model.encoder.config.output_attentions, return_dict=False, + **params ) From 23cf8f2e760826a74a6607ed2e585f8bf5b017dc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 24 Jun 2022 12:55:16 +0000 Subject: [PATCH 
32/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index afe0ef0f86..d720b0428e 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -353,11 +353,7 @@ def forward( params["output_attentions"] = output_attentions return self.model( - input_ids=input_ids, - token_type_ids=segment_ids, - attention_mask=attention_mask, - return_dict=False, - **params + input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=False, **params ) From 922340a6e9e7d5b6e4b5d2d3396c3f10ce482923 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 24 Jun 2022 16:01:28 +0200 Subject: [PATCH 33/89] trying to simplify DPREncoder --- haystack/modeling/model/language_model.py | 233 ++++++++++++++-------- 1 file changed, 146 insertions(+), 87 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index afe0ef0f86..f36b6f19bd 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -290,7 +290,6 @@ def __init__( config_class: PretrainedConfig = getattr(transformers, model_type + "Config", None) model_class: PreTrainedModel = getattr(transformers, model_type + "Model", None) - # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" if os.path.exists(haystack_lm_config): # Haystack style @@ -460,81 +459,145 @@ def __init__( * The name of a remote model on s3 (for example, "facebook/dpr-question_encoder-single-nq-base"). * A local path of a model trained using transformers (for example, "some_dir/huggingface_model"). * A local path of a model trained using Haystack (for example, "some_dir/haystack_model"). + :param pretrained_model_name_or_path: The path of the base pretrained language model whose weights are used to initialize DPRQuestionEncoder. + :param model_type: the type of model (see `HUGGINGFACE_TO_HAYSTACK`) + :param model_kwargs: any kwarg to pass to the model at init + :param language: the model's language. If not given, it will be inferred. Defaults to english. 
+ :param n_added_tokens: unused for `DPREncoder` + :param use_auth_token: useful if the model is from the HF Hub and private + :param model_kwargs: any kwarg to pass to the model at init """ super().__init__(model_type=model_type) self.role = "question" if "question" in model_type.lower() else "context" self._encoder = None - kwargs = model_kwargs or {} model_classname = f"DPR{self.role.capitalize()}Encoder" try: model_class: Type[PreTrainedModel] = getattr(transformers, model_classname) except AttributeError as e: raise ModelingError(f"Model class of type '{model_classname}' not found.") - # We need to differentiate between loading model using Haystack format and Pytorch-Transformers format haystack_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json" if os.path.exists(haystack_lm_config): - # Haystack style - original_model_config = AutoConfig.from_pretrained(haystack_lm_config) - haystack_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin" + self._init_model_haystack_style( + haystack_lm_config=haystack_lm_config, + model_name_or_path=pretrained_model_name_or_path, + model_class=model_kwargs or {}, + model_kwargs=model_kwargs, + use_auth_token=use_auth_token + ) + else: + self._init_model_transformers_style( + model_name_or_path=pretrained_model_name_or_path, + model_class=model_class, + model_kwargs=model_kwargs or {}, + use_auth_token=use_auth_token, + language=language + ) - if original_model_config.model_type == "dpr": - dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) - self.model = model_class.from_pretrained(haystack_lm_model, config=dpr_config, **kwargs) - else: - if original_model_config.model_type != "bert": - logger.warning( - f"Using a model of type '{original_model_config.model_type}' which might be incompatible with DPR encoders." - f"Bert based encoders are supported that need input_ids,token_type_ids,attention_mask as input tensors." - ) - original_config_dict = vars(original_model_config) - original_config_dict.update(kwargs) - self.model = model_class(config=transformers.DPRConfig(**original_config_dict)) - - language_model_type = _get_model_type(haystack_lm_config, use_auth_token=use_auth_token, **kwargs) - # Find the class corresponding to this model type - try: - language_model_class: Type[Union[HFLanguageModel, DPREncoder]] = HUGGINGFACE_TO_HAYSTACK[ - language_model_type - ] - except KeyError as e: - raise ValueError( - f"The type of model supplied ({language_model_type}) is not supported by Haystack. " - f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" - ) - # Instantiate the class for this model - self.model.base_model.bert_model = language_model_class( - pretrained_model_name_or_path=pretrained_model_name_or_path, model_type="bert", **kwargs - ).model + def _init_model_haystack_style( + self, + haystack_lm_config: str, + model_name_or_path: Union[str, Path], + model_class: Type[LanguageModel], + model_kwargs: Dict[str, Any], + use_auth_token: Optional[bool] = None + ): + """ + Init a Haystack-style DPR model. 
+ + :param haystack_lm_config: path to the language model config file + :param model_name_or_path: name or path of the model to load + :param model_class: the wrapper class to use for this model (a subclass of LanguageModel, see `HUGGINGFACE_TO_HAYSTACK`) + :param model_kwargs: any kwarg to pass to the model at init + :param use_auth_token: useful if the model is from the HF Hub and private + """ + original_model_config = AutoConfig.from_pretrained(haystack_lm_config) + haystack_lm_model = Path(model_name_or_path) / "language_model.bin" - self.language = self.model.config.language + if original_model_config.model_type == "dpr": + dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) + self.model = model_class.from_pretrained(haystack_lm_model, config=dpr_config, **model_kwargs) + else: - original_model_config = AutoConfig.from_pretrained( - pretrained_model_name_or_path, use_auth_token=use_auth_token + self.model = self._init_model_through_config( + model_config=original_model_config, + model_class=model_class, + model_kwargs=model_kwargs ) - if original_model_config.model_type == "dpr": - # "pretrained dpr model": load existing pretrained DPRQuestionEncoder model - self.model = model_class.from_pretrained( - str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **kwargs - ) - else: - # "from scratch": load weights from different architecture (e.g. bert) into DPRQuestionEncoder - # but keep config values from original architecture - # TODO test for architectures other than BERT, e.g. Electra - if original_model_config.model_type != "bert": - logger.warning( - f"Using a model of type '{original_model_config.model_type}' which might be incompatible with DPR encoders." - f"Bert based encoders are supported that need input_ids,token_type_ids,attention_mask as input tensors." - ) - original_config_dict = vars(original_model_config) - original_config_dict.update(kwargs) - self.model = model_class(config=transformers.DPRConfig(**original_config_dict)) - self.model.base_model.bert_model = AutoModel.from_pretrained( - str(pretrained_model_name_or_path), use_auth_token=use_auth_token, **original_config_dict + language_model_type = _get_model_type( + haystack_lm_config, + use_auth_token=use_auth_token, + **model_kwargs + ) + # Find the class corresponding to this model type + try: + language_model_class = HUGGINGFACE_TO_HAYSTACK[language_model_type] + except KeyError as e: + raise ValueError( + f"The type of model supplied ({model_name_or_path} , " + f"detected type:{language_model_type}) is not supported by Haystack. " + f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" ) - self.language = language or _guess_language(str(pretrained_model_name_or_path)) + # Instantiate the class for this model + self.model.base_model.bert_model = language_model_class( + pretrained_model_name_or_path=model_name_or_path, model_type="bert", **model_kwargs + ).model + + self.language = self.model.config.language + + def _init_model_transformers_style( + self, + model_name_or_path: Union[str, Path], + model_class: Type[LanguageModel], + model_kwargs: Dict[str, Any], + use_auth_token: Optional[bool] = None, + language: Optional[str] = None + ): + """ + Init a Transformers-style DPR model. 
+ + :param model_name_or_path: name or path of the model to load + :param model_class: the wrapper class to use for this model (a subclass of LanguageModel, see `HUGGINGFACE_TO_HAYSTACK`) + :param model_kwargs: any kwarg to pass to the model at init + :param use_auth_token: useful if the model is from the HF Hub and private + :param language: the model's language. If not given, it will be inferred. Defaults to english. + """ + original_model_config = AutoConfig.from_pretrained( + model_name_or_path, use_auth_token=use_auth_token + ) + if original_model_config.model_type == "dpr": + # "pretrained dpr model": load existing pretrained DPRQuestionEncoder model + self.model = model_class.from_pretrained( + str(model_name_or_path), use_auth_token=use_auth_token, **model_kwargs + ) + else: + # "from scratch": load weights from different architecture (e.g. bert) into DPRQuestionEncoder + # but keep config values from original architecture + # TODO test for architectures other than BERT, e.g. Electra + self.model = self._init_model_through_config( + model_config=original_model_config, + model_class=model_class, + model_kwargs=model_kwargs + ) + self.model.base_model.bert_model = AutoModel.from_pretrained( + str(model_name_or_path), use_auth_token=use_auth_token, **vars(original_model_config) + ) + self.language = language or _guess_language(str(model_name_or_path)) + + def _init_model_through_config(self, model_config, model_class, model_kwargs): + """ + Init a DPR model using a config object. + """ + if model_config.model_type != "bert": + logger.warning( + f"Using a model of type '{model_config.model_type}' which might be incompatible with DPR encoders." + f"Bert based encoders are supported that need input_ids,token_type_ids,attention_mask as input tensors." + ) + config_dict = vars(model_config) + config_dict.update(model_kwargs) + return model_class(config=transformers.DPRConfig(**config_dict)) @property def encoder(self): @@ -542,48 +605,42 @@ def encoder(self): self._encoder = self.model.question_encoder if self.role == "question" else self.model.ctx_encoder return self._encoder - def save_config(self, save_dir: Union[Path, str]): + def save_config(self, save_dir: Union[Path, str]) -> None: """ Save the configuration of the language model in Haystack format. + + :param save_dir: the path to save the model at """ # For DPR models, transformers overwrites the model_type with the one set in DPRConfig # Therefore, we copy the model_type from the model config to DPRConfig setattr(transformers.DPRConfig, "model_type", self.model.config.model_type) super().save_config(save_dir=save_dir) - def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] = None): + def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] = None) -> None: """ Save the model `state_dict` and its configuration file so that it can be loaded again. :param save_dir: The directory in which the model should be saved. - :param state_dict: A dictionary containing the whole state of the module including names of layers. By default, the unchanged state dictionary of the module is used. + :param state_dict: A dictionary containing the whole state of the module including names of layers. + By default, the unchanged state dictionary of the module is used. 
""" model_to_save = self.model.module if hasattr(self.model, "module") else self.model # Only save the model itself if "dpr" not in self.model.config.model_type.lower(): - if model_to_save.base_model_prefix.startswith("ctx_"): - state_dict = model_to_save.state_dict() - if state_dict: - keys = state_dict.keys() - for key in list(keys): - new_key = key - if key.startswith("ctx_encoder.bert_model.model."): - new_key = key.split("_encoder.bert_model.model.", 1)[1] - elif key.startswith("ctx_encoder.bert_model."): - new_key = key.split("_encoder.bert_model.", 1)[1] - state_dict[new_key] = state_dict.pop(key) - - elif model_to_save.base_model_prefix.startswith("question_"): - state_dict = model_to_save.state_dict() - if state_dict: - keys = state_dict.keys() - for key in list(keys): - new_key = key - if key.startswith("question_encoder.bert_model.model."): - new_key = key.split("_encoder.bert_model.model.", 1)[1] - elif key.startswith("question_encoder.bert_model."): - new_key = key.split("_encoder.bert_model.", 1)[1] - state_dict[new_key] = state_dict.pop(key) + prefix = "question" if self.role == "question" else "ctx" + + state_dict = model_to_save.state_dict() + if state_dict: + for key in state_dict.keys(): + new_key = key + + if key.startswith(f"{prefix}_encoder.bert_model.model."): + new_key = key.split("_encoder.bert_model.model.", 1)[1] + + elif key.startswith(f"{prefix}_encoder.bert_model."): + new_key = key.split("_encoder.bert_model.", 1)[1] + + state_dict[new_key] = state_dict.pop(key) super().save(save_dir=save_dir, state_dict=state_dict) @@ -604,9 +661,11 @@ def forward( It is a tensor of shape [batch_size, number_of_hard_negative_passages, max_seq_len]. :param attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens of shape [batch_size, number_of_hard_negative_passages, max_seq_len]. + :param output_hidden_states: whether to add the hidden states along with the pooled output + :param outout_attentions: unused for DPREncoder :return: Embeddings for each token in the input sequence. """ - if not self.role == "question": + if self.role == "context": max_seq_len = input_ids.shape[-1] input_ids = input_ids.view(-1, max_seq_len) segment_ids = segment_ids.view(-1, max_seq_len) @@ -618,9 +677,9 @@ def forward( if output_hidden_states or self.encoder.config.output_hidden_states: pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states return pooled_output, all_hidden_states - else: - pooled_output = output_tuple.pooler_output - return pooled_output, None + + pooled_output = output_tuple.pooler_output + return pooled_output, None HUGGINGFACE_TO_HAYSTACK: Dict[str, Type[LanguageModel]] = { From 17db5004fbf6ffd73bdbde4c42a9497d743f4ca9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 24 Jun 2022 14:04:50 +0000 Subject: [PATCH 34/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 60 ++++++++++------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 6f4dae1dd5..9a01c5c0a0 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -455,7 +455,7 @@ def __init__( * The name of a remote model on s3 (for example, "facebook/dpr-question_encoder-single-nq-base"). * A local path of a model trained using transformers (for example, "some_dir/huggingface_model"). 
* A local path of a model trained using Haystack (for example, "some_dir/haystack_model"). - + :param pretrained_model_name_or_path: The path of the base pretrained language model whose weights are used to initialize DPRQuestionEncoder. :param model_type: the type of model (see `HUGGINGFACE_TO_HAYSTACK`) :param model_kwargs: any kwarg to pass to the model at init @@ -481,28 +481,28 @@ def __init__( model_name_or_path=pretrained_model_name_or_path, model_class=model_kwargs or {}, model_kwargs=model_kwargs, - use_auth_token=use_auth_token - ) + use_auth_token=use_auth_token, + ) else: self._init_model_transformers_style( model_name_or_path=pretrained_model_name_or_path, model_class=model_class, model_kwargs=model_kwargs or {}, use_auth_token=use_auth_token, - language=language + language=language, ) def _init_model_haystack_style( - self, - haystack_lm_config: str, - model_name_or_path: Union[str, Path], - model_class: Type[LanguageModel], - model_kwargs: Dict[str, Any], - use_auth_token: Optional[bool] = None + self, + haystack_lm_config: str, + model_name_or_path: Union[str, Path], + model_class: Type[LanguageModel], + model_kwargs: Dict[str, Any], + use_auth_token: Optional[bool] = None, ): """ Init a Haystack-style DPR model. - + :param haystack_lm_config: path to the language model config file :param model_name_or_path: name or path of the model to load :param model_class: the wrapper class to use for this model (a subclass of LanguageModel, see `HUGGINGFACE_TO_HAYSTACK`) @@ -515,18 +515,12 @@ def _init_model_haystack_style( if original_model_config.model_type == "dpr": dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) self.model = model_class.from_pretrained(haystack_lm_model, config=dpr_config, **model_kwargs) - + else: self.model = self._init_model_through_config( - model_config=original_model_config, - model_class=model_class, - model_kwargs=model_kwargs - ) - language_model_type = _get_model_type( - haystack_lm_config, - use_auth_token=use_auth_token, - **model_kwargs + model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) + language_model_type = _get_model_type(haystack_lm_config, use_auth_token=use_auth_token, **model_kwargs) # Find the class corresponding to this model type try: language_model_class = HUGGINGFACE_TO_HAYSTACK[language_model_type] @@ -545,11 +539,11 @@ def _init_model_haystack_style( def _init_model_transformers_style( self, - model_name_or_path: Union[str, Path], - model_class: Type[LanguageModel], - model_kwargs: Dict[str, Any], + model_name_or_path: Union[str, Path], + model_class: Type[LanguageModel], + model_kwargs: Dict[str, Any], use_auth_token: Optional[bool] = None, - language: Optional[str] = None + language: Optional[str] = None, ): """ Init a Transformers-style DPR model. @@ -560,9 +554,7 @@ def _init_model_transformers_style( :param use_auth_token: useful if the model is from the HF Hub and private :param language: the model's language. If not given, it will be inferred. Defaults to english. 
""" - original_model_config = AutoConfig.from_pretrained( - model_name_or_path, use_auth_token=use_auth_token - ) + original_model_config = AutoConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token) if original_model_config.model_type == "dpr": # "pretrained dpr model": load existing pretrained DPRQuestionEncoder model self.model = model_class.from_pretrained( @@ -573,15 +565,13 @@ def _init_model_transformers_style( # but keep config values from original architecture # TODO test for architectures other than BERT, e.g. Electra self.model = self._init_model_through_config( - model_config=original_model_config, - model_class=model_class, - model_kwargs=model_kwargs + model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) self.model.base_model.bert_model = AutoModel.from_pretrained( str(model_name_or_path), use_auth_token=use_auth_token, **vars(original_model_config) ) self.language = language or _guess_language(str(model_name_or_path)) - + def _init_model_through_config(self, model_config, model_class, model_kwargs): """ Init a DPR model using a config object. @@ -617,7 +607,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] Save the model `state_dict` and its configuration file so that it can be loaded again. :param save_dir: The directory in which the model should be saved. - :param state_dict: A dictionary containing the whole state of the module including names of layers. + :param state_dict: A dictionary containing the whole state of the module including names of layers. By default, the unchanged state dictionary of the module is used. """ model_to_save = self.model.module if hasattr(self.model, "module") else self.model # Only save the model itself @@ -629,10 +619,10 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] if state_dict: for key in state_dict.keys(): new_key = key - + if key.startswith(f"{prefix}_encoder.bert_model.model."): new_key = key.split("_encoder.bert_model.model.", 1)[1] - + elif key.startswith(f"{prefix}_encoder.bert_model."): new_key = key.split("_encoder.bert_model.", 1)[1] @@ -673,7 +663,7 @@ def forward( if output_hidden_states or self.encoder.config.output_hidden_states: pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states return pooled_output, all_hidden_states - + pooled_output = output_tuple.pooler_output return pooled_output, None From 90608961d5574e871dbfd79fea6ed994fbb41aa2 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 24 Jun 2022 16:23:14 +0200 Subject: [PATCH 35/89] Fix wrong param --- haystack/modeling/model/language_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 6f4dae1dd5..bf356ac180 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -479,8 +479,8 @@ def __init__( self._init_model_haystack_style( haystack_lm_config=haystack_lm_config, model_name_or_path=pretrained_model_name_or_path, - model_class=model_kwargs or {}, - model_kwargs=model_kwargs, + model_class=model_class, + model_kwargs=model_kwargs or {}, use_auth_token=use_auth_token ) else: @@ -494,7 +494,7 @@ def __init__( def _init_model_haystack_style( self, - haystack_lm_config: str, + haystack_lm_config: Path, model_name_or_path: Union[str, Path], model_class: Type[LanguageModel], model_kwargs: Dict[str, Any], @@ -523,7 +523,7 @@ def 
_init_model_haystack_style( model_kwargs=model_kwargs ) language_model_type = _get_model_type( - haystack_lm_config, + model_name_or_path=model_name_or_path, use_auth_token=use_auth_token, **model_kwargs ) From bf7bbceeddcf7299ea18815e32a5eaacef5a2d16 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 24 Jun 2022 16:25:38 +0200 Subject: [PATCH 36/89] mypy --- haystack/modeling/model/language_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index bf356ac180..2f3c5a64d4 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -498,7 +498,7 @@ def _init_model_haystack_style( model_name_or_path: Union[str, Path], model_class: Type[LanguageModel], model_kwargs: Dict[str, Any], - use_auth_token: Optional[bool] = None + use_auth_token: Optional[Union[str, bool]] = None ): """ Init a Haystack-style DPR model. @@ -548,7 +548,7 @@ def _init_model_transformers_style( model_name_or_path: Union[str, Path], model_class: Type[LanguageModel], model_kwargs: Dict[str, Any], - use_auth_token: Optional[bool] = None, + use_auth_token: Optional[Union[str, bool]] = None, language: Optional[str] = None ): """ @@ -678,7 +678,7 @@ def forward( return pooled_output, None -HUGGINGFACE_TO_HAYSTACK: Dict[str, Type[LanguageModel]] = { +HUGGINGFACE_TO_HAYSTACK: Dict[str, Union[Type[HFLanguageModel], Type[DPREncoder]]] = { "Auto": HFLanguageModel, "Albert": HFLanguageModel, "Bert": HFLanguageModel, From 2b36db8edde3f22e9427667a14c6291f794e50ae Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 24 Jun 2022 14:28:39 +0000 Subject: [PATCH 37/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 58 ++++++++++------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 2f3c5a64d4..02c12935c6 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -455,7 +455,7 @@ def __init__( * The name of a remote model on s3 (for example, "facebook/dpr-question_encoder-single-nq-base"). * A local path of a model trained using transformers (for example, "some_dir/huggingface_model"). * A local path of a model trained using Haystack (for example, "some_dir/haystack_model"). - + :param pretrained_model_name_or_path: The path of the base pretrained language model whose weights are used to initialize DPRQuestionEncoder. 
:param model_type: the type of model (see `HUGGINGFACE_TO_HAYSTACK`) :param model_kwargs: any kwarg to pass to the model at init @@ -481,28 +481,28 @@ def __init__( model_name_or_path=pretrained_model_name_or_path, model_class=model_class, model_kwargs=model_kwargs or {}, - use_auth_token=use_auth_token - ) + use_auth_token=use_auth_token, + ) else: self._init_model_transformers_style( model_name_or_path=pretrained_model_name_or_path, model_class=model_class, model_kwargs=model_kwargs or {}, use_auth_token=use_auth_token, - language=language + language=language, ) def _init_model_haystack_style( - self, - haystack_lm_config: Path, - model_name_or_path: Union[str, Path], - model_class: Type[LanguageModel], - model_kwargs: Dict[str, Any], - use_auth_token: Optional[Union[str, bool]] = None + self, + haystack_lm_config: Path, + model_name_or_path: Union[str, Path], + model_class: Type[LanguageModel], + model_kwargs: Dict[str, Any], + use_auth_token: Optional[Union[str, bool]] = None, ): """ Init a Haystack-style DPR model. - + :param haystack_lm_config: path to the language model config file :param model_name_or_path: name or path of the model to load :param model_class: the wrapper class to use for this model (a subclass of LanguageModel, see `HUGGINGFACE_TO_HAYSTACK`) @@ -515,17 +515,13 @@ def _init_model_haystack_style( if original_model_config.model_type == "dpr": dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) self.model = model_class.from_pretrained(haystack_lm_model, config=dpr_config, **model_kwargs) - + else: self.model = self._init_model_through_config( - model_config=original_model_config, - model_class=model_class, - model_kwargs=model_kwargs + model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) language_model_type = _get_model_type( - model_name_or_path=model_name_or_path, - use_auth_token=use_auth_token, - **model_kwargs + model_name_or_path=model_name_or_path, use_auth_token=use_auth_token, **model_kwargs ) # Find the class corresponding to this model type try: @@ -545,11 +541,11 @@ def _init_model_haystack_style( def _init_model_transformers_style( self, - model_name_or_path: Union[str, Path], - model_class: Type[LanguageModel], - model_kwargs: Dict[str, Any], + model_name_or_path: Union[str, Path], + model_class: Type[LanguageModel], + model_kwargs: Dict[str, Any], use_auth_token: Optional[Union[str, bool]] = None, - language: Optional[str] = None + language: Optional[str] = None, ): """ Init a Transformers-style DPR model. @@ -560,9 +556,7 @@ def _init_model_transformers_style( :param use_auth_token: useful if the model is from the HF Hub and private :param language: the model's language. If not given, it will be inferred. Defaults to english. """ - original_model_config = AutoConfig.from_pretrained( - model_name_or_path, use_auth_token=use_auth_token - ) + original_model_config = AutoConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token) if original_model_config.model_type == "dpr": # "pretrained dpr model": load existing pretrained DPRQuestionEncoder model self.model = model_class.from_pretrained( @@ -573,15 +567,13 @@ def _init_model_transformers_style( # but keep config values from original architecture # TODO test for architectures other than BERT, e.g. 
Electra self.model = self._init_model_through_config( - model_config=original_model_config, - model_class=model_class, - model_kwargs=model_kwargs + model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) self.model.base_model.bert_model = AutoModel.from_pretrained( str(model_name_or_path), use_auth_token=use_auth_token, **vars(original_model_config) ) self.language = language or _guess_language(str(model_name_or_path)) - + def _init_model_through_config(self, model_config, model_class, model_kwargs): """ Init a DPR model using a config object. @@ -617,7 +609,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] Save the model `state_dict` and its configuration file so that it can be loaded again. :param save_dir: The directory in which the model should be saved. - :param state_dict: A dictionary containing the whole state of the module including names of layers. + :param state_dict: A dictionary containing the whole state of the module including names of layers. By default, the unchanged state dictionary of the module is used. """ model_to_save = self.model.module if hasattr(self.model, "module") else self.model # Only save the model itself @@ -629,10 +621,10 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] if state_dict: for key in state_dict.keys(): new_key = key - + if key.startswith(f"{prefix}_encoder.bert_model.model."): new_key = key.split("_encoder.bert_model.model.", 1)[1] - + elif key.startswith(f"{prefix}_encoder.bert_model."): new_key = key.split("_encoder.bert_model.", 1)[1] @@ -673,7 +665,7 @@ def forward( if output_hidden_states or self.encoder.config.output_hidden_states: pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states return pooled_output, all_hidden_states - + pooled_output = output_tuple.pooler_output return pooled_output, None From 67b84da7f6f641599950ef7e322cb50b76a7e763 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 24 Jun 2022 16:51:04 +0200 Subject: [PATCH 38/89] Use segment_ids instead of token_type_ids --- haystack/modeling/model/language_model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 2f3c5a64d4..96e6a5bd78 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -589,7 +589,7 @@ def _init_model_through_config(self, model_config, model_class, model_kwargs): if model_config.model_type != "bert": logger.warning( f"Using a model of type '{model_config.model_type}' which might be incompatible with DPR encoders." - f"Bert based encoders are supported that need input_ids,token_type_ids,attention_mask as input tensors." + f"Bert based encoders are supported that need input_ids, token_type_ids, attention_mask as input tensors." 
) config_dict = vars(model_config) config_dict.update(model_kwargs) @@ -668,7 +668,8 @@ def forward( attention_mask = attention_mask.view(-1, max_seq_len) output_tuple = self.model( - input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True + #input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True + input_ids=input_ids, segment_ids=segment_ids, attention_mask=attention_mask, return_dict=True ) if output_hidden_states or self.encoder.config.output_hidden_states: pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states From 4d56310d06a454f21385376e4b81fe0705b632de Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 24 Jun 2022 14:54:45 +0000 Subject: [PATCH 39/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 545bd5177c..28ec56125d 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -660,8 +660,11 @@ def forward( attention_mask = attention_mask.view(-1, max_seq_len) output_tuple = self.model( - #input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True - input_ids=input_ids, segment_ids=segment_ids, attention_mask=attention_mask, return_dict=True + # input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True + input_ids=input_ids, + segment_ids=segment_ids, + attention_mask=attention_mask, + return_dict=True, ) if output_hidden_states or self.encoder.config.output_hidden_states: pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states From 28729660620516b555249ef9ae4bd581efe1d31c Mon Sep 17 00:00:00 2001 From: ZanSara Date: Mon, 4 Jul 2022 14:33:50 +0200 Subject: [PATCH 40/89] Fix question_generator tests & factor out distilbert --- haystack/modeling/model/language_model.py | 69 ++++++++++++++++++----- test/conftest.py | 6 -- test/nodes/test_question_generator.py | 9 ++- 3 files changed, 64 insertions(+), 20 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 28ec56125d..6a71581aef 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -102,8 +102,8 @@ def encoder(self): def forward( self, input_ids: torch.Tensor, - segment_ids: torch.Tensor, attention_mask: torch.Tensor, + segment_ids: Optional[torch.Tensor], # DistilBERT does not use them, see DistilBERTLanguageModel output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, ): @@ -322,8 +322,8 @@ def __init__( def forward( self, input_ids: torch.Tensor, - segment_ids: torch.Tensor, attention_mask: torch.Tensor, + segment_ids: torch.Tensor, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, ): @@ -340,20 +340,24 @@ def forward( :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. :return: Embeddings for each token in the input sequence. Can also return hidden states and attentions if specified using the arguments `output_hidden_states` and `output_attentions`. 
""" - encoder = getattr(self, "encoder", None) # Not all models have an encoder - if encoder: + if hasattr(self, "encoder"): # Not all models have an encoder output_hidden_states = output_hidden_states or self.model.encoder.config.output_hidden_states output_attentions = output_attentions or self.model.encoder.config.output_attentions params = {} + if input_ids is not None: + params["input_ids"] = input_ids + if segment_ids is not None: + # Some models don't take this (see DistilBERT) + params["token_type_ids"] = segment_ids + if attention_mask is not None: + params["attention_mask"] = attention_mask if output_hidden_states: params["output_hidden_states"] = output_hidden_states if output_attentions: params["output_attentions"] = output_attentions - return self.model( - input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=False, **params - ) + return self.model(**params, return_dict=False) class HFLanguageModelWithPooler(HFLanguageModel): @@ -435,6 +439,45 @@ def forward( return (output_tuple[0], pooled_output) + output_tuple[1:] +class DistilBERTLanguageModel(HFLanguageModelWithPooler): + """ + A model that wraps Hugging Face's implementation of DistilBERT + (https://github.com/huggingface/transformers) to fit the LanguageModel class. + + Note that DistilBERT does not use segment_ids, so it is for now kept in a separate subclass. + """ + def forward( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + segment_ids: Optional[torch.Tensor] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + ): + """ + Perform the forward pass of the model. + + :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. + :param attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens + of shape [batch_size, max_seq_len]. Different models call this parameter differently (padding/attention mask). + :param segment_ids: Unused, see DistilBERT documentation. + :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. + :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. + :return: Embeddings for each token in the input sequence. Can also return hidden states and attentions if + specified using the arguments `output_hidden_states` and `output_attentions`. + """ + if segment_ids is not None: + logging.warning("`segment_ids` is not None, but DistilBERT does not use them. They will be ignored.") + + return super().forward( + input_ids=input_ids, + segment_ids=None, + attention_mask=attention_mask, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + ) + + class DPREncoder(LanguageModel): """ A DPREncoder model that wraps Hugging Face's implementation. @@ -682,7 +725,7 @@ def forward( "Camembert": HFLanguageModel, "Codebert": HFLanguageModel, "DebertaV2": HFLanguageModelWithPooler, - "DistilBert": HFLanguageModelWithPooler, + "DistilBert": DistilBERTLanguageModel, "DPRContextEncoder": DPREncoder, "DPRQuestionEncoder": DPREncoder, "Electra": HFLanguageModelWithPooler, @@ -762,8 +805,7 @@ def get_language_model( else: # It's from the model hub - logger.info(f"Could not find '{pretrained_model_name_or_path}' locally.") - logger.info(f"Looking on Transformers Model Hub (in local cache and online)...") + logger.info(f"Could not find '{pretrained_model_name_or_path}' locally. 
Searching in the Model Hub...") model_type = _get_model_type( pretrained_model_name_or_path, use_auth_token=use_auth_token, @@ -773,7 +815,7 @@ def get_language_model( if not model_type: raise ModelingError( f"Model not found for '{pretrained_model_name_or_path}'. Either supply the local path for a saved " - f"model or one of bert/roberta/xlnet/albert/distilbert models that can be downloaded from remote. " + f"model, or the name of one of a model that can be downloaded from the Model Hub. " f"Ensure that the model class name can be inferred from the directory name when loading a " f"Transformers' model." ) @@ -817,13 +859,14 @@ def _get_model_type( try: config = AutoConfig.from_pretrained( pretrained_model_name_or_path=model_name_or_path, - model_type=model_type, use_auth_token=use_auth_token, revision=revision, **(autoconfig_kwargs or {}), ) + # Find if this mode is present in MODEL_TYPE_BY_NAME.keys() even with a different capitalization - model_type = {key.lower(): key for key in HUGGINGFACE_TO_HAYSTACK.keys()}.get(config.model_type.lower(), None) + if config.model_type: + model_type = {key.lower(): key for key in HUGGINGFACE_TO_HAYSTACK.keys()}.get(config.model_type.lower(), None) except Exception as e: logger.exception(f"AutoConfig failed to load on '{model_name_or_path}'. ") diff --git a/test/conftest.py b/test/conftest.py index 3fd78c197f..2578c44f1c 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -61,7 +61,6 @@ from haystack.nodes.reader.table import TableReader, RCIReader from haystack.nodes.summarizer.transformers import TransformersSummarizer from haystack.nodes.translator import TransformersTranslator -from haystack.nodes.question_generator import QuestionGenerator from haystack.modeling.infer import Inferencer, QAInferencer @@ -509,11 +508,6 @@ def rag_generator(): return RAGenerator(model_name_or_path="facebook/rag-token-nq", generator_type="token", max_length=20) -@pytest.fixture -def question_generator(): - return QuestionGenerator(model_name_or_path="valhalla/t5-small-e2e-qg") - - @pytest.fixture def lfqa_generator(request): return Seq2SeqGenerator(model_name_or_path=request.param, min_length=100, max_length=200) diff --git a/test/nodes/test_question_generator.py b/test/nodes/test_question_generator.py index 52a6712c64..4efe6f88a7 100644 --- a/test/nodes/test_question_generator.py +++ b/test/nodes/test_question_generator.py @@ -1,10 +1,12 @@ +import pytest + from haystack.pipelines import ( QuestionAnswerGenerationPipeline, QuestionGenerationPipeline, RetrieverQuestionGenerationPipeline, ) +from haystack.nodes.question_generator import QuestionGenerator from haystack.schema import Document -import pytest text = 'The Living End are an Australian punk rockabilly band from Melbourne, formed in 1994. Since 2002, the line-up consists of Chris Cheney (vocals, guitar), Scott Owen (double bass, vocals), and Andy Strachan (drums). The band rose to fame in 1997 after the release of their EP Second Solution / Prisoner of Society, which peaked at No. 4 on the Australian ARIA Singles Chart. They have released eight studio albums, two of which reached the No. 1 spot on the ARIA Albums Chart: The Living End (October 1998) and State of Emergency (February 2006). They have also achieved chart success in the U.S. and the United Kingdom. 
The Band was nominated 27 times and won five awards at the Australian ARIA Music Awards ceremonies: "Highest Selling Single" for Second Solution / Prisoner of Society (1998), "Breakthrough Artist – Album" and "Best Group" for The Living End (1999), as well as "Best Rock Album" for White Noise (2008) and The Ending Is Just the Beginning Repeating (2011). In October 2010, their debut album was listed in the book "100 Best Australian Albums". Australian musicologist Ian McFarlane described the group as "one of Australia’s premier rock acts. By blending a range of styles (punk, rockabilly and flat out rock) with great success, The Living End has managed to produce anthemic choruses and memorable songs in abundance".' @@ -12,6 +14,11 @@ query = "Living End" +@pytest.fixture +def question_generator(): + return QuestionGenerator(model_name_or_path="valhalla/t5-small-e2e-qg") + + def test_qg_pipeline(question_generator): p = QuestionGenerationPipeline(question_generator) result = p.run(documents=[document]) From 20f0e1d558c4a4fbcddbf40f0c67d98d4694ed96 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 4 Jul 2022 12:39:06 +0000 Subject: [PATCH 41/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 6a71581aef..d4689406e0 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -446,6 +446,7 @@ class DistilBERTLanguageModel(HFLanguageModelWithPooler): Note that DistilBERT does not use segment_ids, so it is for now kept in a separate subclass. """ + def forward( self, input_ids: torch.Tensor, @@ -463,7 +464,7 @@ def forward( :param segment_ids: Unused, see DistilBERT documentation. :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. - :return: Embeddings for each token in the input sequence. Can also return hidden states and attentions if + :return: Embeddings for each token in the input sequence. Can also return hidden states and attentions if specified using the arguments `output_hidden_states` and `output_attentions`. """ if segment_ids is not None: @@ -863,10 +864,12 @@ def _get_model_type( revision=revision, **(autoconfig_kwargs or {}), ) - + # Find if this mode is present in MODEL_TYPE_BY_NAME.keys() even with a different capitalization if config.model_type: - model_type = {key.lower(): key for key in HUGGINGFACE_TO_HAYSTACK.keys()}.get(config.model_type.lower(), None) + model_type = {key.lower(): key for key in HUGGINGFACE_TO_HAYSTACK.keys()}.get( + config.model_type.lower(), None + ) except Exception as e: logger.exception(f"AutoConfig failed to load on '{model_name_or_path}'. 
") From b8287cfcbc9caec3017e68d2126d7b3d8ad80639 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Mon, 4 Jul 2022 14:42:32 +0200 Subject: [PATCH 42/89] mypy --- haystack/modeling/model/language_model.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 6a71581aef..e3340f6849 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -412,8 +412,8 @@ def __init__( def forward( self, input_ids: torch.Tensor, - segment_ids: torch.Tensor, attention_mask: torch.Tensor, + segment_ids: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, ): @@ -421,6 +421,9 @@ def forward( Perform the forward pass of the model. :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. + :param segment_ids: The ID of the segment. For example, in next sentence prediction, the tokens in the + first sentence are marked with 0 and the tokens in the second sentence are marked with 1. + It is a tensor of shape [batch_size, max_seq_len]. Optional, some models don't need it (DistilBERT for example) :param padding_mask/attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens of shape [batch_size, max_seq_len]. Different models call this parameter differently (padding/attention mask). :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. @@ -439,12 +442,12 @@ def forward( return (output_tuple[0], pooled_output) + output_tuple[1:] -class DistilBERTLanguageModel(HFLanguageModelWithPooler): +class HFLanguageModelNoSegmentIds(HFLanguageModelWithPooler): """ - A model that wraps Hugging Face's implementation of DistilBERT + A model that wraps Hugging Face's implementation of a model that does not need segment ids. (https://github.com/huggingface/transformers) to fit the LanguageModel class. - Note that DistilBERT does not use segment_ids, so it is for now kept in a separate subclass. + These are for now kept in a separate subclass to show a proper warning. """ def forward( self, @@ -460,14 +463,14 @@ def forward( :param input_ids: The IDs of each token in the input sequence. It's a tensor of shape [batch_size, max_seq_len]. :param attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens of shape [batch_size, max_seq_len]. Different models call this parameter differently (padding/attention mask). - :param segment_ids: Unused, see DistilBERT documentation. + :param segment_ids: Unused. See DistilBERT documentation. :param output_hidden_states: When set to `True`, outputs hidden states in addition to the embeddings. :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. :return: Embeddings for each token in the input sequence. Can also return hidden states and attentions if specified using the arguments `output_hidden_states` and `output_attentions`. """ if segment_ids is not None: - logging.warning("`segment_ids` is not None, but DistilBERT does not use them. They will be ignored.") + logging.warning(f"`segment_ids` is not None, but {self.name} does not use them. 
They will be ignored.") return super().forward( input_ids=input_ids, @@ -678,8 +681,8 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] def forward( self, input_ids: torch.Tensor, - segment_ids: torch.Tensor, attention_mask: torch.Tensor, + segment_ids: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, ): @@ -725,7 +728,7 @@ def forward( "Camembert": HFLanguageModel, "Codebert": HFLanguageModel, "DebertaV2": HFLanguageModelWithPooler, - "DistilBert": DistilBERTLanguageModel, + "DistilBert": HFLanguageModelNoSegmentIds, "DPRContextEncoder": DPREncoder, "DPRQuestionEncoder": DPREncoder, "Electra": HFLanguageModelWithPooler, From 6278f69108735939e30ceab4315069e207575808 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Mon, 4 Jul 2022 14:46:07 +0200 Subject: [PATCH 43/89] mypy again --- haystack/modeling/model/language_model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index a30326ee48..9da292fe28 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -703,9 +703,10 @@ def forward( if self.role == "context": max_seq_len = input_ids.shape[-1] input_ids = input_ids.view(-1, max_seq_len) - segment_ids = segment_ids.view(-1, max_seq_len) attention_mask = attention_mask.view(-1, max_seq_len) - + if segment_ids: + segment_ids = segment_ids.view(-1, max_seq_len) + output_tuple = self.model( # input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True input_ids=input_ids, From 35e464fcb64f37612946b7c5b61980ce2d05b750 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 4 Jul 2022 12:49:02 +0000 Subject: [PATCH 44/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 9da292fe28..e0ed2884c3 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -706,7 +706,7 @@ def forward( attention_mask = attention_mask.view(-1, max_seq_len) if segment_ids: segment_ids = segment_ids.view(-1, max_seq_len) - + output_tuple = self.model( # input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True input_ids=input_ids, From 3e5f080c53b5d139147fc01bc7836d70af1cfb37 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Mon, 4 Jul 2022 17:32:59 +0200 Subject: [PATCH 45/89] remove infer_tokenizer_classes from dense.py, unused --- haystack/modeling/model/adaptive_model.py | 25 +++++++++------------ haystack/modeling/model/biadaptive_model.py | 6 ++--- haystack/modeling/model/language_model.py | 16 ++++++++----- haystack/nodes/retriever/dense.py | 22 ++---------------- test/nodes/test_retriever.py | 3 ++- 5 files changed, 28 insertions(+), 44 deletions(-) diff --git a/haystack/modeling/model/adaptive_model.py b/haystack/modeling/model/adaptive_model.py index e3af09286b..1d01dc4671 100644 --- a/haystack/modeling/model/adaptive_model.py +++ b/haystack/modeling/model/adaptive_model.py @@ -488,16 +488,14 @@ def forward( output_hidden_states=output_hidden_states, output_attentions=output_attentions, ) - if output_hidden_states: - if output_attentions: - sequence_output, pooled_output, hidden_states, attentions = 
output_tuple - else: - sequence_output, pooled_output, hidden_states = output_tuple + if output_hidden_states and output_attentions: + sequence_output, pooled_output, hidden_states, attentions = output_tuple + elif output_hidden_states: + sequence_output, pooled_output, hidden_states = output_tuple + elif output_attentions: + sequence_output, pooled_output, attentions = output_tuple else: - if output_attentions: - sequence_output, pooled_output, attentions = output_tuple - else: - sequence_output, pooled_output = output_tuple + sequence_output, pooled_output = output_tuple # Run forward pass of (multiple) prediction heads using the output from above all_logits = [] if len(self.prediction_heads) > 0: @@ -520,12 +518,11 @@ def forward( # just return LM output (e.g. useful for extracting embeddings at inference time) all_logits.append((sequence_output, pooled_output)) + if output_hidden_states and output_attentions: + return all_logits, hidden_states, attentions if output_hidden_states: - if output_attentions: - return all_logits, hidden_states, attentions - else: - return all_logits, hidden_states - elif output_attentions: + return all_logits, hidden_states + if output_attentions: return all_logits, attentions return all_logits diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index ed5152892a..bf41db0b7f 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -344,9 +344,9 @@ def forward_lm( if passage_input_ids is not None and passage_segment_ids is not None and passage_attention_mask is not None: pooled_output2, _ = self.language_model2( - input_ids=passage_input_ids[0], - segment_ids=passage_segment_ids[0], - attention_mask=passage_attention_mask[0], + input_ids=passage_input_ids, + segment_ids=passage_segment_ids, + attention_mask=passage_attention_mask, ) pooled_output[1] = pooled_output2 diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 9da292fe28..4943edb7b2 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -402,7 +402,7 @@ def __init__( # The pooler takes the first hidden representation & feeds it to a dense layer of (hidden_dim x hidden_dim). # We don't want a dropout in the end of the pooler, since we do that already in the adaptive model before we # feed everything to the prediction head - sequence_summary_config = PARAMETERS_BY_MODEL.get(self.name.lower(), {}) + sequence_summary_config = POOLER_PARAMETERS.get(self.name.lower(), {}) for key, value in sequence_summary_config.items(): setattr(config, key, value) @@ -697,20 +697,19 @@ def forward( :param attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens of shape [batch_size, number_of_hard_negative_passages, max_seq_len]. :param output_hidden_states: whether to add the hidden states along with the pooled output - :param outout_attentions: unused for DPREncoder + :param output_attentions: unused for DPREncoder :return: Embeddings for each token in the input sequence. 
""" if self.role == "context": max_seq_len = input_ids.shape[-1] input_ids = input_ids.view(-1, max_seq_len) attention_mask = attention_mask.view(-1, max_seq_len) - if segment_ids: + if segment_ids is not None: segment_ids = segment_ids.view(-1, max_seq_len) output_tuple = self.model( - # input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True input_ids=input_ids, - segment_ids=segment_ids, + token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True, ) @@ -722,6 +721,7 @@ def forward( return pooled_output, None +#: Match the name of the HuggingFace Model class to the corresponding Haystack wrapper HUGGINGFACE_TO_HAYSTACK: Dict[str, Union[Type[HFLanguageModel], Type[DPREncoder]]] = { "Auto": HFLanguageModel, "Albert": HFLanguageModel, @@ -743,6 +743,8 @@ def forward( "XLMRoberta": HFLanguageModel, "XLNet": HFLanguageModelWithPooler, } + +#: Regex to match variants of the HF class name, to enhance our mode type guessing abilities. NAME_HINTS: Dict[str, str] = { "xlm.*roberta": "XLMRoberta", "roberta.*xml": "XLMRoberta", @@ -753,7 +755,9 @@ def forward( "dpr.*ctx.*encoder": "DPRContextEncoder", "deberta-v2": "DebertaV2", } -PARAMETERS_BY_MODEL: Dict[str, Dict[str, Any]] = { + +#: Parameters or the pooler of models that don't have their own pooler +POOLER_PARAMETERS: Dict[str, Dict[str, Any]] = { "DistilBert": {"summary_last_dropout": 0, "summary_type": "first", "summary_activation": "tanh"}, "XLNet": {"summary_last_dropout": 0}, "Electra": { diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 3d908e456b..34000ba23e 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -61,7 +61,6 @@ def __init__( batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, - infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, @@ -106,8 +105,6 @@ def __init__( before writing them to the DocumentStore like this: {"text": "my text", "meta": {"name": "my title"}}. :param use_fast_tokenizers: Whether to use fast Rust tokenizers - :param infer_tokenizer_classes: Whether to infer tokenizer class from the model config / name. - If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`. :param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training. Options: `dot_product` (Default) or `cosine` :param global_loss_buffer_size: Buffer size for all_gather() in DDP. 
@@ -155,8 +152,6 @@ def __init__( "This can be set when initializing the DocumentStore" ) - self.infer_tokenizer_classes = infer_tokenizer_classes - # Init & Load Encoders self.query_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained( pretrained_model_name_or_path=query_embedding_model, @@ -491,8 +486,8 @@ def _get_predictions(self, dicts): leave=False, disable=disable_tqdm, ) as progress_bar: - for batch in data_loader: - batch = {key: batch[key].to(self.devices[0]) for key in batch} + for raw_batch in data_loader: + batch = {key: raw_batch[key].to(self.devices[0]) for key in raw_batch} # get logits with torch.no_grad(): @@ -555,7 +550,6 @@ def embed_documents(self, docs: List[Document]) -> List[np.ndarray]: for d in docs ] embeddings = self._get_predictions(passages)["passages"] - return embeddings def train( @@ -731,7 +725,6 @@ def load( similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", - infer_tokenizer_classes: bool = False, ): """ Load DensePassageRetriever from the specified directory. @@ -748,7 +741,6 @@ def load( embed_title=embed_title, use_fast_tokenizers=use_fast_tokenizers, similarity_function=similarity_function, - infer_tokenizer_classes=infer_tokenizer_classes, ) logger.info(f"DPR model loaded from {load_dir}") @@ -779,7 +771,6 @@ def __init__( batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, - infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, @@ -810,8 +801,6 @@ def __init__( performance if your titles contain meaningful information for retrieval (topic, entities etc.). :param use_fast_tokenizers: Whether to use fast Rust tokenizers - :param infer_tokenizer_classes: Whether to infer tokenizer class from the model config / name. - If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`. :param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training. Options: `dot_product` (Default) or `cosine` :param global_loss_buffer_size: Buffer size for all_gather() in DDP. @@ -860,16 +849,11 @@ def __init__( "This can be set when initializing the DocumentStore" ) - self.infer_tokenizer_classes = infer_tokenizer_classes tokenizers_default_classes: Dict[str, Type[PreTrainedTokenizer]] = { "query": DPRQuestionEncoderTokenizerFast, "passage": DPRContextEncoderTokenizerFast, "table": DPRContextEncoderTokenizerFast, } - if self.infer_tokenizer_classes: - tokenizers_default_classes["query"] = None # type: ignore - tokenizers_default_classes["passage"] = None # type: ignore - tokenizers_default_classes["table"] = None # type: ignore # Init & Load Encoders self.query_tokenizer = tokenizers_default_classes["query"].from_pretrained( @@ -1421,7 +1405,6 @@ def load( query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder", - infer_tokenizer_classes: bool = False, ): """ Load TableTextRetriever from the specified directory. 
@@ -1441,7 +1424,6 @@ def load( embed_meta_fields=embed_meta_fields, use_fast_tokenizers=use_fast_tokenizers, similarity_function=similarity_function, - infer_tokenizer_classes=infer_tokenizer_classes, ) logger.info(f"TableTextRetriever model loaded from {load_dir}") diff --git a/test/nodes/test_retriever.py b/test/nodes/test_retriever.py index b3fd71ff2f..c3c03e40fa 100644 --- a/test/nodes/test_retriever.py +++ b/test/nodes/test_retriever.py @@ -11,6 +11,7 @@ from elasticsearch import Elasticsearch from haystack.document_stores import WeaviateDocumentStore +from haystack.nodes.retriever.base import BaseRetriever from haystack.schema import Document from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore from haystack.document_stores.faiss import FAISSDocumentStore @@ -40,7 +41,7 @@ ], indirect=True, ) -def test_retrieval(retriever_with_docs, document_store_with_docs): +def test_retrieval(retriever_with_docs: BaseRetriever, document_store_with_docs: BaseDocumentStore): if not isinstance(retriever_with_docs, (BM25Retriever, FilterRetriever, TfidfRetriever)): document_store_with_docs.update_embeddings(retriever_with_docs) From 3f973674550d7e8e7bbfcc55cbae17bfb6e13c87 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 4 Jul 2022 15:35:48 +0000 Subject: [PATCH 46/89] Update Documentation & Code Style --- docs/_src/api/api/retriever.md | 12 ++++-------- .../haystack-pipeline-master.schema.json | 10 ---------- haystack/modeling/model/biadaptive_model.py | 4 +--- haystack/modeling/model/language_model.py | 5 +---- 4 files changed, 6 insertions(+), 25 deletions(-) diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index 5e9f90f71b..3b060fb91f 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -519,7 +519,7 @@ Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Que #### DensePassageRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True) +def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True) ``` Init the Retriever incl. the two encoder models from a local or remote model checkpoint. 
@@ -561,8 +561,6 @@ The title is expected to be present in doc.meta["name"] and can be supplied in t before writing them to the DocumentStore like this: {"text": "my text", "meta": {"name": "my title"}}. - `use_fast_tokenizers`: Whether to use fast Rust tokenizers -- `infer_tokenizer_classes`: Whether to infer tokenizer class from the model config / name. -If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`. - `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training. Options: `dot_product` (Default) or `cosine` - `global_loss_buffer_size`: Buffer size for all_gather() in DDP. @@ -871,7 +869,7 @@ None ```python @classmethod -def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", infer_tokenizer_classes: bool = False) +def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder") ``` Load DensePassageRetriever from the specified directory. @@ -895,7 +893,7 @@ Kostić, Bogdan, et al. (2021): "Multi-modal Retrieval of Tables and Texts Using #### TableTextRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, infer_tokenizer_classes: bool = False, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True) +def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True) ``` Init the Retriever incl. 
the two encoder models from a local or remote model checkpoint. @@ -923,8 +921,6 @@ This is the approach used in the original paper and is likely to improve performance if your titles contain meaningful information for retrieval (topic, entities etc.). - `use_fast_tokenizers`: Whether to use fast Rust tokenizers -- `infer_tokenizer_classes`: Whether to infer tokenizer class from the model config / name. -If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`. - `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training. Options: `dot_product` (Default) or `cosine` - `global_loss_buffer_size`: Buffer size for all_gather() in DDP. @@ -1153,7 +1149,7 @@ None ```python @classmethod -def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder", infer_tokenizer_classes: bool = False) +def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder") ``` Load TableTextRetriever from the specified directory. 
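For illustration, a minimal usage sketch against the simplified `DensePassageRetriever` signature documented above, now that `infer_tokenizer_classes` is gone. The `InMemoryDocumentStore`, the example document, the query, and the `top_k` value are placeholder assumptions; the model names are the documented defaults.

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes.retriever.dense import DensePassageRetriever
from haystack.schema import Document

# An in-memory store keeps the sketch self-contained; any BaseDocumentStore works the same way.
document_store = InMemoryDocumentStore()
document_store.write_documents(
    [Document(content="The Living End are an Australian punk rockabilly band from Melbourne, formed in 1994.")]
)

# `infer_tokenizer_classes` is no longer accepted: the fast DPR tokenizers are always used.
retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_fast_tokenizers=True,
    similarity_function="dot_product",
)

# Embed the stored documents with the passage encoder, then query with the question encoder.
document_store.update_embeddings(retriever)
docs = retriever.retrieve(query="Where are The Living End from?", top_k=1)
```

Since the retrievers now always resolve to the fast DPR tokenizer classes, no tokenizer-related flag is needed at construction time.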
diff --git a/haystack/json-schemas/haystack-pipeline-master.schema.json b/haystack/json-schemas/haystack-pipeline-master.schema.json index ff55116564..1429981500 100644 --- a/haystack/json-schemas/haystack-pipeline-master.schema.json +++ b/haystack/json-schemas/haystack-pipeline-master.schema.json @@ -2091,11 +2091,6 @@ "default": true, "type": "boolean" }, - "infer_tokenizer_classes": { - "title": "Infer Tokenizer Classes", - "default": false, - "type": "boolean" - }, "similarity_function": { "title": "Similarity Function", "default": "dot_product", @@ -4088,11 +4083,6 @@ "default": true, "type": "boolean" }, - "infer_tokenizer_classes": { - "title": "Infer Tokenizer Classes", - "default": false, - "type": "boolean" - }, "similarity_function": { "title": "Similarity Function", "default": "dot_product", diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index bf41db0b7f..f3fa76a538 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -344,9 +344,7 @@ def forward_lm( if passage_input_ids is not None and passage_segment_ids is not None and passage_attention_mask is not None: pooled_output2, _ = self.language_model2( - input_ids=passage_input_ids, - segment_ids=passage_segment_ids, - attention_mask=passage_attention_mask, + input_ids=passage_input_ids, segment_ids=passage_segment_ids, attention_mask=passage_attention_mask ) pooled_output[1] = pooled_output2 diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index f989042670..f5b09aa74a 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -708,10 +708,7 @@ def forward( segment_ids = segment_ids.view(-1, max_seq_len) output_tuple = self.model( - input_ids=input_ids, - token_type_ids=segment_ids, - attention_mask=attention_mask, - return_dict=True, + input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True ) if output_hidden_states or self.encoder.config.output_hidden_states: pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states From e514b2f1ed9d2d641ec206fa3570c47ff2c68110 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Mon, 4 Jul 2022 18:04:22 +0200 Subject: [PATCH 47/89] Remove usage of kwargs in evaluation.py eval() --- haystack/modeling/evaluation/eval.py | 8 +++++++- haystack/modeling/model/language_model.py | 11 ++++++----- test/conftest.py | 6 ++++++ test/nodes/test_question_generator.py | 5 ----- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/haystack/modeling/evaluation/eval.py b/haystack/modeling/evaluation/eval.py index 4cdba7409f..c12e39152c 100644 --- a/haystack/modeling/evaluation/eval.py +++ b/haystack/modeling/evaluation/eval.py @@ -69,7 +69,13 @@ def eval( with torch.no_grad(): - logits = model.forward(**batch) + logits = model.forward( + input_ids=batch.get("input_ids", None), + segment_ids=batch.get("segment_ids", None), + padding_mask=batch.get("padding_mask", None), + output_hidden_states=batch.get("input_ids", False), + output_attentions=batch.get("input_ids", False), + ) losses_per_head = model.logits_to_loss_per_head(logits=logits, **batch) preds = model.logits_to_preds(logits=logits, **batch) labels = model.prepare_labels(**batch) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index f5b09aa74a..79fbe36113 100644 --- a/haystack/modeling/model/language_model.py +++ 
b/haystack/modeling/model/language_model.py @@ -798,6 +798,7 @@ def get_language_model( :param language_model_type: (Optional) Name of the language model class to load (for example `Bert`). Overrides any other discovered value. """ logger.info(f" * LOADING MODEL: '{pretrained_model_name_or_path}'") + from_where = "" config_file = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -805,13 +806,13 @@ def get_language_model( if os.path.exists(config_file): # it's a local directory in Haystack format - logger.info(f"Model found locally at {pretrained_model_name_or_path}") + from_where = "local storage" config = json.load(open(config_file)) model_type = config["name"] else: # It's from the model hub - logger.info(f"Could not find '{pretrained_model_name_or_path}' locally. Searching in the Model Hub...") + from_where = "the Model Hub" model_type = _get_model_type( pretrained_model_name_or_path, use_auth_token=use_auth_token, @@ -831,8 +832,8 @@ def get_language_model( language_model_class: Type[Union[HFLanguageModel, DPREncoder]] = HUGGINGFACE_TO_HAYSTACK[model_type] except KeyError as e: raise ValueError( - f"The type of model supplied ({model_type}) is not supported by Haystack. " - f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" + f"The type of model supplied ({model_type}) is not supported by Haystack or was not correclty identified. " + f"Supported model types are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" ) from e # Instantiate the class for this model @@ -844,7 +845,7 @@ def get_language_model( use_auth_token=use_auth_token, model_kwargs=model_kwargs, ) - logger.info(f"Loaded '{pretrained_model_name_or_path}' ({model_type} model)") + logger.info(f"Loaded '{pretrained_model_name_or_path}' ({model_type} model) from {from_where}.") return language_model diff --git a/test/conftest.py b/test/conftest.py index 2578c44f1c..3fd78c197f 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -61,6 +61,7 @@ from haystack.nodes.reader.table import TableReader, RCIReader from haystack.nodes.summarizer.transformers import TransformersSummarizer from haystack.nodes.translator import TransformersTranslator +from haystack.nodes.question_generator import QuestionGenerator from haystack.modeling.infer import Inferencer, QAInferencer @@ -508,6 +509,11 @@ def rag_generator(): return RAGenerator(model_name_or_path="facebook/rag-token-nq", generator_type="token", max_length=20) +@pytest.fixture +def question_generator(): + return QuestionGenerator(model_name_or_path="valhalla/t5-small-e2e-qg") + + @pytest.fixture def lfqa_generator(request): return Seq2SeqGenerator(model_name_or_path=request.param, min_length=100, max_length=200) diff --git a/test/nodes/test_question_generator.py b/test/nodes/test_question_generator.py index 4efe6f88a7..1813c5be1c 100644 --- a/test/nodes/test_question_generator.py +++ b/test/nodes/test_question_generator.py @@ -14,11 +14,6 @@ query = "Living End" -@pytest.fixture -def question_generator(): - return QuestionGenerator(model_name_or_path="valhalla/t5-small-e2e-qg") - - def test_qg_pipeline(question_generator): p = QuestionGenerationPipeline(question_generator) result = p.run(documents=[document]) From b4c6bebe06c5bd347454b40a303ffa3941a8a3f8 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Mon, 4 Jul 2022 18:42:24 +0200 Subject: [PATCH 48/89] fix log --- haystack/modeling/model/language_model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py 
b/haystack/modeling/model/language_model.py index 79fbe36113..51521f2998 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -798,7 +798,7 @@ def get_language_model( :param language_model_type: (Optional) Name of the language model class to load (for example `Bert`). Overrides any other discovered value. """ logger.info(f" * LOADING MODEL: '{pretrained_model_name_or_path}'") - from_where = "" + from_where = "local storage" config_file = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -806,7 +806,6 @@ def get_language_model( if os.path.exists(config_file): # it's a local directory in Haystack format - from_where = "local storage" config = json.load(open(config_file)) model_type = config["name"] From 1797d227c8ce38babb0b6dc77abd4dd204f0a678 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Mon, 4 Jul 2022 18:44:21 +0200 Subject: [PATCH 49/89] typo --- haystack/modeling/evaluation/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/evaluation/eval.py b/haystack/modeling/evaluation/eval.py index c12e39152c..d73d77213c 100644 --- a/haystack/modeling/evaluation/eval.py +++ b/haystack/modeling/evaluation/eval.py @@ -73,8 +73,8 @@ def eval( input_ids=batch.get("input_ids", None), segment_ids=batch.get("segment_ids", None), padding_mask=batch.get("padding_mask", None), - output_hidden_states=batch.get("input_ids", False), - output_attentions=batch.get("input_ids", False), + output_hidden_states=batch.get("output_hidden_states", False), + output_attentions=batch.get("output_attentions", False), ) losses_per_head = model.logits_to_loss_per_head(logits=logits, **batch) preds = model.logits_to_preds(logits=logits, **batch) From c32e10c97d4873100b6083f3909c1d074147b6c5 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 6 Jul 2022 17:53:05 +0200 Subject: [PATCH 50/89] Fix dpr tests --- haystack/modeling/model/biadaptive_model.py | 22 ++-- haystack/modeling/model/language_model.py | 106 ++++++++++++-------- test/modeling/test_dpr.py | 37 +++++-- 3 files changed, 111 insertions(+), 54 deletions(-) diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index f3fa76a538..45daf5b867 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -6,6 +6,7 @@ import torch from torch import nn +from transformers import DPRContextEncoder from haystack.modeling.data_handler.processor import Processor from haystack.modeling.model.language_model import get_language_model, LanguageModel @@ -28,8 +29,11 @@ def loss_per_head_sum( class BiAdaptiveModel(nn.Module): - """PyTorch implementation containing all the modelling needed for your NLP task. Combines 2 language - models for representation of 2 sequences and a prediction head. Allows for gradient flow back to the 2 language model components.""" + """ + PyTorch implementation containing all the modelling needed for your NLP task. + Combines 2 language models for representation of 2 sequences and a prediction head. + Allows for gradient flow back to the 2 language model components. 
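The two `batch.get("input_ids", False)` defaults fixed in the patch above were copy-paste slips that silently passed a tensor where a boolean flag was expected. A small self-contained sketch of the intended pattern (the batch contents are made up for illustration):

```python
import torch

# Illustrative feature batch, as a processor might produce it
batch = {
    "input_ids": torch.ones(2, 8, dtype=torch.long),
    "segment_ids": torch.zeros(2, 8, dtype=torch.long),
    "padding_mask": torch.ones(2, 8, dtype=torch.long),
    "labels": torch.tensor([0, 1]),  # present in the batch but not a forward() argument
}

# Pass only the arguments the model's forward() expects, with explicit defaults,
# instead of model.forward(**batch), which would also forward "labels" and similar keys.
forward_kwargs = {
    "input_ids": batch.get("input_ids", None),
    "segment_ids": batch.get("segment_ids", None),
    "padding_mask": batch.get("padding_mask", None),
    "output_hidden_states": batch.get("output_hidden_states", False),  # correct key, boolean default
    "output_attentions": batch.get("output_attentions", False),
}
print({key: getattr(value, "shape", value) for key, value in forward_kwargs.items()})
```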
+ """ def __init__( self, @@ -140,13 +144,13 @@ def load( """ # Language Model if lm1_name: - language_model1 = get_language_model(os.path.join(load_dir, lm1_name)) + language_model1 = get_language_model(os.path.join(load_dir, lm1_name), model_type="DPRQuestionEncoder") else: - language_model1 = get_language_model(load_dir) + language_model1 = get_language_model(load_dir, model_type="DPRQuestionEncoder") if lm2_name: - language_model2 = get_language_model(os.path.join(load_dir, lm2_name)) + language_model2 = get_language_model(os.path.join(load_dir, lm2_name), model_type="DPRContextEncoder") else: - language_model2 = get_language_model(load_dir) + language_model2 = get_language_model(load_dir, model_type="DPRContextEncoder") # Prediction heads ph_config_files = cls._get_prediction_head_files(load_dir) @@ -343,6 +347,12 @@ def forward_lm( pooled_output[0] = pooled_output1 if passage_input_ids is not None and passage_segment_ids is not None and passage_attention_mask is not None: + + max_seq_len = passage_input_ids.shape[-1] + passage_input_ids = passage_input_ids.view(-1, max_seq_len) + passage_attention_mask = passage_attention_mask.view(-1, max_seq_len) + passage_segment_ids = passage_segment_ids.view(-1, max_seq_len) + pooled_output2, _ = self.language_model2( input_ids=passage_input_ids, segment_ids=passage_segment_ids, attention_mask=passage_attention_mask ) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 51521f2998..f40410582e 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -106,6 +106,7 @@ def forward( segment_ids: Optional[torch.Tensor], # DistilBERT does not use them, see DistilBERTLanguageModel output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, + return_dict: bool = False, ): raise NotImplementedError @@ -326,6 +327,7 @@ def forward( segment_ids: torch.Tensor, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, + return_dict: bool = False, ): """ Perform the forward pass of the model. @@ -357,7 +359,7 @@ def forward( if output_attentions: params["output_attentions"] = output_attentions - return self.model(**params, return_dict=False) + return self.model(**params, return_dict=return_dict) class HFLanguageModelWithPooler(HFLanguageModel): @@ -416,6 +418,7 @@ def forward( segment_ids: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, + return_dict: bool = False, ): """ Perform the forward pass of the model. @@ -437,6 +440,7 @@ def forward( attention_mask=attention_mask, output_hidden_states=output_hidden_states, output_attentions=output_attentions, + return_dict=return_dict ) pooled_output = self.pooler(output_tuple[0]) return (output_tuple[0], pooled_output) + output_tuple[1:] @@ -457,6 +461,7 @@ def forward( segment_ids: Optional[torch.Tensor] = None, output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, + return_dict: bool = False, ): """ Perform the forward pass of the model. 
@@ -478,7 +483,8 @@ def forward( segment_ids=None, attention_mask=attention_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions, + output_attentions=output_attentions,, + return_dict=return_dict ) @@ -543,7 +549,7 @@ def _init_model_haystack_style( self, haystack_lm_config: Path, model_name_or_path: Union[str, Path], - model_class: Type[LanguageModel], + model_class: Type[PreTrainedModel], model_kwargs: Dict[str, Any], use_auth_token: Optional[Union[str, bool]] = None, ): @@ -552,14 +558,14 @@ def _init_model_haystack_style( :param haystack_lm_config: path to the language model config file :param model_name_or_path: name or path of the model to load - :param model_class: the wrapper class to use for this model (a subclass of LanguageModel, see `HUGGINGFACE_TO_HAYSTACK`) + :param model_class: The HuggingFace model class name :param model_kwargs: any kwarg to pass to the model at init :param use_auth_token: useful if the model is from the HF Hub and private """ original_model_config = AutoConfig.from_pretrained(haystack_lm_config) haystack_lm_model = Path(model_name_or_path) / "language_model.bin" - if original_model_config.model_type == "dpr": + if "dpr" in original_model_config.model_type.lower(): dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) self.model = model_class.from_pretrained(haystack_lm_model, config=dpr_config, **model_kwargs) @@ -567,21 +573,19 @@ def _init_model_haystack_style( self.model = self._init_model_through_config( model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) - language_model_type = _get_model_type( - model_name_or_path=model_name_or_path, use_auth_token=use_auth_token, **model_kwargs - ) - # Find the class corresponding to this model type try: - language_model_class = HUGGINGFACE_TO_HAYSTACK[language_model_type] + language_model_class = HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE.get(original_model_config.model_type) except KeyError as e: raise ValueError( f"The type of model supplied ({model_name_or_path} , " - f"detected type:{language_model_type}) is not supported by Haystack. " + f"({original_model_config.model_type}) is not supported by Haystack. " f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" ) # Instantiate the class for this model self.model.base_model.bert_model = language_model_class( - pretrained_model_name_or_path=model_name_or_path, model_type="bert", **model_kwargs + pretrained_model_name_or_path=model_name_or_path, + model_type=HUGGINGFACE_CAPITALIZE.get(original_model_config.model_type.lower()), + **model_kwargs ).model self.language = self.model.config.language @@ -589,7 +593,7 @@ def _init_model_haystack_style( def _init_model_transformers_style( self, model_name_or_path: Union[str, Path], - model_class: Type[LanguageModel], + model_class: Type[PreTrainedModel], model_kwargs: Dict[str, Any], use_auth_token: Optional[Union[str, bool]] = None, language: Optional[str] = None, @@ -598,13 +602,13 @@ def _init_model_transformers_style( Init a Transformers-style DPR model. :param model_name_or_path: name or path of the model to load - :param model_class: the wrapper class to use for this model (a subclass of LanguageModel, see `HUGGINGFACE_TO_HAYSTACK`) + :param model_class: The HuggingFace model class name :param model_kwargs: any kwarg to pass to the model at init :param use_auth_token: useful if the model is from the HF Hub and private :param language: the model's language. If not given, it will be inferred. 
Defaults to english. """ original_model_config = AutoConfig.from_pretrained(model_name_or_path, use_auth_token=use_auth_token) - if original_model_config.model_type == "dpr": + if "dpr" in original_model_config.model_type.lower(): # "pretrained dpr model": load existing pretrained DPRQuestionEncoder model self.model = model_class.from_pretrained( str(model_name_or_path), use_auth_token=use_auth_token, **model_kwargs @@ -621,14 +625,16 @@ def _init_model_transformers_style( ) self.language = language or _guess_language(str(model_name_or_path)) - def _init_model_through_config(self, model_config, model_class, model_kwargs): + def _init_model_through_config( + self, model_config: AutoConfig, model_class: Type[PreTrainedModel], model_kwargs: Optional[Dict[str, Any]] + ): """ Init a DPR model using a config object. """ - if model_config.model_type != "bert": + if model_config.model_type.lower() != "bert": logger.warning( - f"Using a model of type '{model_config.model_type}' which might be incompatible with DPR encoders." - f"Bert based encoders are supported that need input_ids, token_type_ids, attention_mask as input tensors." + f"Using a model of type '{model_config.model_type}' which might be incompatible with DPR encoders. " + f"Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors." ) config_dict = vars(model_config) config_dict.update(model_kwargs) @@ -666,7 +672,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] state_dict = model_to_save.state_dict() if state_dict: - for key in state_dict.keys(): + for key in list(state_dict.keys()): # list() here performs a copy and allows editing the dict new_key = key if key.startswith(f"{prefix}_encoder.bert_model.model."): @@ -700,13 +706,6 @@ def forward( :param output_attentions: unused for DPREncoder :return: Embeddings for each token in the input sequence. 
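The `_init_model_through_config` path above builds a DPR encoder around a BERT-style config via `transformers.DPRConfig(**config_dict)`. A hedged, standalone sketch of that idea follows; the checkpoint name and the extra kwarg are illustrative, and the resulting encoder head is randomly initialised:

```python
import transformers
from transformers import AutoConfig, DPRContextEncoder

bert_config = AutoConfig.from_pretrained("bert-base-uncased")  # any BERT-based config
config_dict = dict(vars(bert_config))  # plain dict of the hyperparameters, copied before editing

model_kwargs = {"projection_dim": 0}  # stands in for user-supplied model_kwargs
if model_kwargs:  # the patch above adds this guard because model_kwargs may be None
    config_dict.update(model_kwargs)

encoder = DPRContextEncoder(config=transformers.DPRConfig(**config_dict))
print(encoder.config.model_type)  # "dpr", built on top of the BERT-sized hyperparameters
```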
""" - if self.role == "context": - max_seq_len = input_ids.shape[-1] - input_ids = input_ids.view(-1, max_seq_len) - attention_mask = attention_mask.view(-1, max_seq_len) - if segment_ids is not None: - segment_ids = segment_ids.view(-1, max_seq_len) - output_tuple = self.model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True ) @@ -717,6 +716,23 @@ def forward( pooled_output = output_tuple.pooler_output return pooled_output, None + # # If inheriting from HFLanguageModel + # output_tuple = super().forward( + # input_ids=input_ids, + # segment_ids=segment_ids, + # attention_mask=attention_mask, + # output_hidden_states=output_hidden_states, + # output_attentions=output_attentions, + # return_dict=return_dict + # ) + # if output_hidden_states or self.encoder.config.output_hidden_states: + # pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states + # return pooled_output, all_hidden_states + + # pooled_output = output_tuple.pooler_output + # return pooled_output, None + + #: Match the name of the HuggingFace Model class to the corresponding Haystack wrapper HUGGINGFACE_TO_HAYSTACK: Dict[str, Union[Type[HFLanguageModel], Type[DPREncoder]]] = { @@ -740,6 +756,10 @@ def forward( "XLMRoberta": HFLanguageModel, "XLNet": HFLanguageModelWithPooler, } +#: Case insensitive version +HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE = {k.lower(): v for k, v in HUGGINGFACE_TO_HAYSTACK.items()} +#: HF Capitalization pairs +HUGGINGFACE_CAPITALIZE = {k.lower(): k for k in HUGGINGFACE_TO_HAYSTACK.keys()} #: Regex to match variants of the HF class name, to enhance our mode type guessing abilities. NAME_HINTS: Dict[str, str] = { @@ -747,9 +767,11 @@ def forward( "roberta.*xml": "XLMRoberta", "codebert.*mlm": "Roberta", "mlm.*codebert": "Roberta", - "dpr.*question.*encoder": "DPRQuestionEncoder", - "dpr.*context.*encoder": "DPRContextEncoder", - "dpr.*ctx.*encoder": "DPRContextEncoder", + "[dpr]?.*question.*encoder": "DPRQuestionEncoder", + "[dpr]?.*query.*encoder": "DPRQuestionEncoder", + "[dpr]?.*passage.*encoder": "DPRContextEncoder", + "[dpr]?.*context.*encoder": "DPRContextEncoder", + "[dpr]?.*ctx.*encoder": "DPRContextEncoder", "deberta-v2": "DebertaV2", } @@ -797,7 +819,7 @@ def get_language_model( :param revision: The version of the model to use from the Hugging Face model hub. This can be a tag name, a branch name, or a commit hash. :param language_model_type: (Optional) Name of the language model class to load (for example `Bert`). Overrides any other discovered value. """ - logger.info(f" * LOADING MODEL: '{pretrained_model_name_or_path}'") + logger.info(f" * LOADING MODEL: '{pretrained_model_name_or_path}' {'('+model_type+')' if model_type else ''}") from_where = "local storage" config_file = Path(pretrained_model_name_or_path) / "language_model_config.json" @@ -820,10 +842,12 @@ def get_language_model( ) if not model_type: raise ModelingError( - f"Model not found for '{pretrained_model_name_or_path}'. Either supply the local path for a saved " - f"model, or the name of one of a model that can be downloaded from the Model Hub. " - f"Ensure that the model class name can be inferred from the directory name when loading a " - f"Transformers' model." + f"Model type not understood for '{pretrained_model_name_or_path}' " + f"({model_type if model_type else 'model_type not set'}). " + "Either supply the local path for a saved model, " + "or the name of one of a model that can be downloaded from the Model Hub. 
" + "Ensure that the model class name can be inferred from the directory name " + "when loading a Transformers model." ) # Find the class corresponding to this model type @@ -831,7 +855,7 @@ def get_language_model( language_model_class: Type[Union[HFLanguageModel, DPREncoder]] = HUGGINGFACE_TO_HAYSTACK[model_type] except KeyError as e: raise ValueError( - f"The type of model supplied ({model_type}) is not supported by Haystack or was not correclty identified. " + f"The type of model supplied ({model_type}) is not supported by Haystack or was not correctly identified. " f"Supported model types are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" ) from e @@ -870,14 +894,12 @@ def _get_model_type( **(autoconfig_kwargs or {}), ) - # Find if this mode is present in MODEL_TYPE_BY_NAME.keys() even with a different capitalization - if config.model_type: - model_type = {key.lower(): key for key in HUGGINGFACE_TO_HAYSTACK.keys()}.get( - config.model_type.lower(), None - ) + # Find if this mode is present in HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE.keys() + if config.model_type.lower() in HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE.keys(): + model_type = config.model_type except Exception as e: - logger.exception(f"AutoConfig failed to load on '{model_name_or_path}'. ") + logger.error(f"AutoConfig failed to load on '{model_name_or_path}': {str(e)}") if not model_type: logger.warning("Could not infer the model type from its config. Looking for clues in the model name.") diff --git a/test/modeling/test_dpr.py b/test/modeling/test_dpr.py index d66e2f9d4f..1c22ac0559 100644 --- a/test/modeling/test_dpr.py +++ b/test/modeling/test_dpr.py @@ -1,3 +1,5 @@ +from typing import Tuple + import os import logging from pathlib import Path @@ -7,6 +9,7 @@ import torch from torch.utils.data import SequentialSampler from tqdm import tqdm +from transformers import DPRQuestionEncoder from haystack.modeling.data_handler.dataloader import NamedDataLoader from haystack.modeling.data_handler.processor import TextSimilarityProcessor @@ -134,10 +137,15 @@ def test_dpr_modules(caplog=None): features_passage = { key.replace("passage_", ""): value for key, value in features.items() if key.startswith("passage_") } + max_seq_len = features_passage.get("input_ids").shape[-1] + features_passage = { + key: value.view(-1, max_seq_len) for key, value in features_passage.items() + } # test model encodings query_vector = model.language_model1(**features_query)[0] passage_vector = model.language_model2(**features_passage)[0] + assert torch.all( torch.le( query_vector[0, :10].cpu() @@ -679,7 +687,7 @@ def test_dpr_processor_save_load(tmp_path): {"query": "facebook/dpr-question_encoder-single-nq-base", "passage": "facebook/dpr-ctx_encoder-single-nq-base"}, ], ) -def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_model): +def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_passage_model: Tuple[str, str]): """ This test compares 1) a model that was loaded from model hub with 2) a model from model hub that was saved to disk and then loaded from disk and @@ -691,7 +699,24 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ "passages": [ { "title": "Etalab", - "text": "Etalab est une administration publique française qui fait notamment office de Chief Data Officer de l'État et coordonne la conception et la mise en œuvre de sa stratégie dans le domaine de la donnée (ouverture et partage des données publiques ou open data, exploitation des données et intelligence 
artificielle...). Ainsi, Etalab développe et maintient le portail des données ouvertes du gouvernement français data.gouv.fr. Etalab promeut également une plus grande ouverture l'administration sur la société (gouvernement ouvert) : transparence de l'action publique, innovation ouverte, participation citoyenne... elle promeut l’innovation, l’expérimentation, les méthodes de travail ouvertes, agiles et itératives, ainsi que les synergies avec la société civile pour décloisonner l’administration et favoriser l’adoption des meilleures pratiques professionnelles dans le domaine du numérique. À ce titre elle étudie notamment l’opportunité de recourir à des technologies en voie de maturation issues du monde de la recherche. Cette entité chargée de l'innovation au sein de l'administration doit contribuer à l'amélioration du service public grâce au numérique. Elle est rattachée à la Direction interministérielle du numérique, dont les missions et l’organisation ont été fixées par le décret du 30 octobre 2019.  Dirigé par Laure Lucchesi depuis 2016, elle rassemble une équipe pluridisciplinaire d'une trentaine de personnes.", + "text": "Etalab est une administration publique française qui fait notamment office " + "de Chief Data Officer de l'État et coordonne la conception et la mise en œuvre " + "de sa stratégie dans le domaine de la donnée (ouverture et partage des données " + "publiques ou open data, exploitation des données et intelligence artificielle...). " + "Ainsi, Etalab développe et maintient le portail des données ouvertes du gouvernement " + "français data.gouv.fr. Etalab promeut également une plus grande ouverture " + "l'administration sur la société (gouvernement ouvert) : transparence de l'action " + "publique, innovation ouverte, participation citoyenne... elle promeut l’innovation, " + "l’expérimentation, les méthodes de travail ouvertes, agiles et itératives, ainsi que " + "les synergies avec la société civile pour décloisonner l’administration et favoriser " + "l’adoption des meilleures pratiques professionnelles dans le domaine du numérique. " + "À ce titre elle étudie notamment l’opportunité de recourir à des technologies en voie " + "de maturation issues du monde de la recherche. Cette entité chargée de l'innovation " + "au sein de l'administration doit contribuer à l'amélioration du service public grâce " + "au numérique. Elle est rattachée à la Direction interministérielle du numérique, dont " + "les missions et l’organisation ont été fixées par le décret du 30 octobre 2019.  
Dirigé " + "par Laure Lucchesi depuis 2016, elle rassemble une équipe pluridisciplinaire d'une " + "trentaine de personnes.", "label": "positive", "external_id": "1", } @@ -704,9 +729,9 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ query_tokenizer = get_tokenizer( pretrained_model_name_or_path=query_embedding_model ) # tokenizer class is inferred automatically - query_encoder = get_language_model(pretrained_model_name_or_path=query_embedding_model) + query_encoder = get_language_model(pretrained_model_name_or_path=query_embedding_model, model_type="DPRQuestionEncoder") passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_embedding_model) - passage_encoder = get_language_model(pretrained_model_name_or_path=passage_embedding_model) + passage_encoder = get_language_model(pretrained_model_name_or_path=passage_embedding_model, model_type="DPRContextEncoder") processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, @@ -748,11 +773,11 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path, query_and_passage_ loaded_query_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, use_fast=True ) # tokenizer class is inferred automatically - loaded_query_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir) + loaded_query_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, model_type="DPRQuestionEncoder") loaded_passage_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, use_fast=True ) - loaded_passage_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir) + loaded_passage_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, model_type="DPRQuestionEncoder") loaded_processor = TextSimilarityProcessor( query_tokenizer=loaded_query_tokenizer, From 41c8b1dd5397265f85c34cef227d498cbbdf0e08 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 6 Jul 2022 15:58:24 +0000 Subject: [PATCH 51/89] Update Documentation & Code Style --- haystack/modeling/model/biadaptive_model.py | 6 +-- test/modeling/test_dpr.py | 54 ++++++++++++--------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 45daf5b867..3527174832 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -30,8 +30,8 @@ def loss_per_head_sum( class BiAdaptiveModel(nn.Module): """ - PyTorch implementation containing all the modelling needed for your NLP task. - Combines 2 language models for representation of 2 sequences and a prediction head. + PyTorch implementation containing all the modelling needed for your NLP task. + Combines 2 language models for representation of 2 sequences and a prediction head. Allows for gradient flow back to the 2 language model components. 
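The test changes above pin the wrapper class explicitly instead of relying on auto-detection. A minimal sketch of that call pattern outside the test suite; the checkpoint names are the public DPR models already used in the test:

```python
from haystack.modeling.model.language_model import get_language_model

query_encoder = get_language_model(
    pretrained_model_name_or_path="facebook/dpr-question_encoder-single-nq-base",
    model_type="DPRQuestionEncoder",  # force the DPR question wrapper, skipping type inference
)
passage_encoder = get_language_model(
    pretrained_model_name_or_path="facebook/dpr-ctx_encoder-single-nq-base",
    model_type="DPRContextEncoder",
)
print(type(query_encoder).__name__, type(passage_encoder).__name__)
```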
""" @@ -352,7 +352,7 @@ def forward_lm( passage_input_ids = passage_input_ids.view(-1, max_seq_len) passage_attention_mask = passage_attention_mask.view(-1, max_seq_len) passage_segment_ids = passage_segment_ids.view(-1, max_seq_len) - + pooled_output2, _ = self.language_model2( input_ids=passage_input_ids, segment_ids=passage_segment_ids, attention_mask=passage_attention_mask ) diff --git a/test/modeling/test_dpr.py b/test/modeling/test_dpr.py index 1c22ac0559..04eab24763 100644 --- a/test/modeling/test_dpr.py +++ b/test/modeling/test_dpr.py @@ -138,9 +138,7 @@ def test_dpr_modules(caplog=None): key.replace("passage_", ""): value for key, value in features.items() if key.startswith("passage_") } max_seq_len = features_passage.get("input_ids").shape[-1] - features_passage = { - key: value.view(-1, max_seq_len) for key, value in features_passage.items() - } + features_passage = {key: value.view(-1, max_seq_len) for key, value in features_passage.items()} # test model encodings query_vector = model.language_model1(**features_query)[0] @@ -700,23 +698,23 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa { "title": "Etalab", "text": "Etalab est une administration publique française qui fait notamment office " - "de Chief Data Officer de l'État et coordonne la conception et la mise en œuvre " - "de sa stratégie dans le domaine de la donnée (ouverture et partage des données " - "publiques ou open data, exploitation des données et intelligence artificielle...). " - "Ainsi, Etalab développe et maintient le portail des données ouvertes du gouvernement " - "français data.gouv.fr. Etalab promeut également une plus grande ouverture " - "l'administration sur la société (gouvernement ouvert) : transparence de l'action " - "publique, innovation ouverte, participation citoyenne... elle promeut l’innovation, " - "l’expérimentation, les méthodes de travail ouvertes, agiles et itératives, ainsi que " - "les synergies avec la société civile pour décloisonner l’administration et favoriser " - "l’adoption des meilleures pratiques professionnelles dans le domaine du numérique. " - "À ce titre elle étudie notamment l’opportunité de recourir à des technologies en voie " - "de maturation issues du monde de la recherche. Cette entité chargée de l'innovation " - "au sein de l'administration doit contribuer à l'amélioration du service public grâce " - "au numérique. Elle est rattachée à la Direction interministérielle du numérique, dont " - "les missions et l’organisation ont été fixées par le décret du 30 octobre 2019.  Dirigé " - "par Laure Lucchesi depuis 2016, elle rassemble une équipe pluridisciplinaire d'une " - "trentaine de personnes.", + "de Chief Data Officer de l'État et coordonne la conception et la mise en œuvre " + "de sa stratégie dans le domaine de la donnée (ouverture et partage des données " + "publiques ou open data, exploitation des données et intelligence artificielle...). " + "Ainsi, Etalab développe et maintient le portail des données ouvertes du gouvernement " + "français data.gouv.fr. Etalab promeut également une plus grande ouverture " + "l'administration sur la société (gouvernement ouvert) : transparence de l'action " + "publique, innovation ouverte, participation citoyenne... 
elle promeut l’innovation, " + "l’expérimentation, les méthodes de travail ouvertes, agiles et itératives, ainsi que " + "les synergies avec la société civile pour décloisonner l’administration et favoriser " + "l’adoption des meilleures pratiques professionnelles dans le domaine du numérique. " + "À ce titre elle étudie notamment l’opportunité de recourir à des technologies en voie " + "de maturation issues du monde de la recherche. Cette entité chargée de l'innovation " + "au sein de l'administration doit contribuer à l'amélioration du service public grâce " + "au numérique. Elle est rattachée à la Direction interministérielle du numérique, dont " + "les missions et l’organisation ont été fixées par le décret du 30 octobre 2019.  Dirigé " + "par Laure Lucchesi depuis 2016, elle rassemble une équipe pluridisciplinaire d'une " + "trentaine de personnes.", "label": "positive", "external_id": "1", } @@ -729,9 +727,13 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa query_tokenizer = get_tokenizer( pretrained_model_name_or_path=query_embedding_model ) # tokenizer class is inferred automatically - query_encoder = get_language_model(pretrained_model_name_or_path=query_embedding_model, model_type="DPRQuestionEncoder") + query_encoder = get_language_model( + pretrained_model_name_or_path=query_embedding_model, model_type="DPRQuestionEncoder" + ) passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_embedding_model) - passage_encoder = get_language_model(pretrained_model_name_or_path=passage_embedding_model, model_type="DPRContextEncoder") + passage_encoder = get_language_model( + pretrained_model_name_or_path=passage_embedding_model, model_type="DPRContextEncoder" + ) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, @@ -773,11 +775,15 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa loaded_query_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, use_fast=True ) # tokenizer class is inferred automatically - loaded_query_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, model_type="DPRQuestionEncoder") + loaded_query_encoder = get_language_model( + pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, model_type="DPRQuestionEncoder" + ) loaded_passage_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, use_fast=True ) - loaded_passage_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, model_type="DPRQuestionEncoder") + loaded_passage_encoder = get_language_model( + pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, model_type="DPRQuestionEncoder" + ) loaded_processor = TextSimilarityProcessor( query_tokenizer=loaded_query_tokenizer, From 377010150e6f7d913bd514a3a0a266d59019dee1 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 7 Jul 2022 11:19:04 +0200 Subject: [PATCH 52/89] typo --- haystack/modeling/model/language_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index f40410582e..c6b22627eb 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -483,7 +483,7 @@ def forward( segment_ids=None, attention_mask=attention_mask, output_hidden_states=output_hidden_states, - output_attentions=output_attentions,, + 
output_attentions=output_attentions, return_dict=return_dict ) From 4c27ce14d7c434e7df35b0af9221a51c6d696969 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 7 Jul 2022 09:25:14 +0000 Subject: [PATCH 53/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index c6b22627eb..4d3a5cdff4 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -440,7 +440,7 @@ def forward( attention_mask=attention_mask, output_hidden_states=output_hidden_states, output_attentions=output_attentions, - return_dict=return_dict + return_dict=return_dict, ) pooled_output = self.pooler(output_tuple[0]) return (output_tuple[0], pooled_output) + output_tuple[1:] @@ -484,7 +484,7 @@ def forward( attention_mask=attention_mask, output_hidden_states=output_hidden_states, output_attentions=output_attentions, - return_dict=return_dict + return_dict=return_dict, ) @@ -558,7 +558,7 @@ def _init_model_haystack_style( :param haystack_lm_config: path to the language model config file :param model_name_or_path: name or path of the model to load - :param model_class: The HuggingFace model class name + :param model_class: The HuggingFace model class name :param model_kwargs: any kwarg to pass to the model at init :param use_auth_token: useful if the model is from the HF Hub and private """ @@ -583,9 +583,9 @@ def _init_model_haystack_style( ) # Instantiate the class for this model self.model.base_model.bert_model = language_model_class( - pretrained_model_name_or_path=model_name_or_path, - model_type=HUGGINGFACE_CAPITALIZE.get(original_model_config.model_type.lower()), - **model_kwargs + pretrained_model_name_or_path=model_name_or_path, + model_type=HUGGINGFACE_CAPITALIZE.get(original_model_config.model_type.lower()), + **model_kwargs, ).model self.language = self.model.config.language @@ -602,7 +602,7 @@ def _init_model_transformers_style( Init a Transformers-style DPR model. :param model_name_or_path: name or path of the model to load - :param model_class: The HuggingFace model class name + :param model_class: The HuggingFace model class name :param model_kwargs: any kwarg to pass to the model at init :param use_auth_token: useful if the model is from the HF Hub and private :param language: the model's language. If not given, it will be inferred. Defaults to english. 
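The `return_dict` flag threaded through these forward passes only changes the output container, not the values. A quick check with a public BERT checkpoint (downloaded at run time) illustrating the difference:

```python
import torch
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("bert-base-uncased").eval()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer("return_dict only changes the output container", return_tensors="pt")

with torch.no_grad():
    as_tuple = model(**inputs, return_dict=False)  # plain tuple: (last_hidden_state, pooler_output)
    as_output = model(**inputs, return_dict=True)  # ModelOutput with named fields

assert torch.equal(as_tuple[0], as_output.last_hidden_state)
assert torch.equal(as_tuple[1], as_output.pooler_output)
```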
@@ -672,7 +672,7 @@ def save(self, save_dir: Union[str, Path], state_dict: Optional[Dict[Any, Any]] state_dict = model_to_save.state_dict() if state_dict: - for key in list(state_dict.keys()): # list() here performs a copy and allows editing the dict + for key in list(state_dict.keys()): # list() here performs a copy and allows editing the dict new_key = key if key.startswith(f"{prefix}_encoder.bert_model.model."): @@ -733,7 +733,6 @@ def forward( # return pooled_output, None - #: Match the name of the HuggingFace Model class to the corresponding Haystack wrapper HUGGINGFACE_TO_HAYSTACK: Dict[str, Union[Type[HFLanguageModel], Type[DPREncoder]]] = { "Auto": HFLanguageModel, From 676d554e110eaf3dcb8dcd4e1dbf2ad58e2b9e9b Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 7 Jul 2022 11:45:21 +0200 Subject: [PATCH 54/89] mypy --- haystack/modeling/model/language_model.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index c6b22627eb..65df09c6c5 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -574,7 +574,7 @@ def _init_model_haystack_style( model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) try: - language_model_class = HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE.get(original_model_config.model_type) + language_model_class = HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE[original_model_config.model_type.lower()] except KeyError as e: raise ValueError( f"The type of model supplied ({model_name_or_path} , " @@ -637,7 +637,8 @@ def _init_model_through_config( f"Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors." ) config_dict = vars(model_config) - config_dict.update(model_kwargs) + if model_kwargs: + config_dict.update(model_kwargs) return model_class(config=transformers.DPRConfig(**config_dict)) @property @@ -692,6 +693,7 @@ def forward( segment_ids: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, + return_dict: bool = True ): """ Perform the forward pass of the DPR encoder model. @@ -707,7 +709,7 @@ def forward( :return: Embeddings for each token in the input sequence. 
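The `list(state_dict.keys())` comment reformatted above exists because the loop renames keys while iterating over them. A tiny standalone sketch of the same idea; the key names and the rename target are illustrative only, not the exact mapping used in `save()`:

```python
import torch

prefix = "question"
state_dict = {  # toy stand-in for a DPR checkpoint
    f"{prefix}_encoder.bert_model.model.embeddings.weight": torch.zeros(2, 2),
    f"{prefix}_encoder.bert_model.model.encoder.bias": torch.zeros(2),
}

for key in list(state_dict.keys()):  # list() copies the keys, so the dict can be edited inside the loop
    if key.startswith(f"{prefix}_encoder.bert_model.model."):
        new_key = key.replace(".bert_model.model.", ".bert_model.")  # illustrative rename
        state_dict[new_key] = state_dict.pop(key)

print(sorted(state_dict))
```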
""" output_tuple = self.model( - input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=True + input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=return_dict ) if output_hidden_states or self.encoder.config.output_hidden_states: pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states From fe95b113b63002985457c6526566afda96ee9af1 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 7 Jul 2022 11:46:53 +0200 Subject: [PATCH 55/89] pylint --- haystack/modeling/model/biadaptive_model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 3527174832..41be92cce5 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -6,7 +6,7 @@ import torch from torch import nn -from transformers import DPRContextEncoder +from transformers import DPRContextEncoder, DPRQuestionEncoder, AutoModel from haystack.modeling.data_handler.processor import Processor from haystack.modeling.model.language_model import get_language_model, LanguageModel @@ -434,8 +434,6 @@ def _get_prediction_head_files(cls, load_dir: Union[str, Path]): return config_files def convert_to_transformers(self): - from transformers import DPRContextEncoder, DPRQuestionEncoder, AutoModel - if len(self.prediction_heads) != 1: raise ValueError( f"Currently conversion only works for models with a SINGLE prediction head. " From b662f0f60b17a4eab34d2ee95269a8eded1fab4f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 7 Jul 2022 09:50:23 +0000 Subject: [PATCH 56/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 78b5abd880..71bc849570 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -574,7 +574,9 @@ def _init_model_haystack_style( model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) try: - language_model_class = HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE[original_model_config.model_type.lower()] + language_model_class = HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE[ + original_model_config.model_type.lower() + ] except KeyError as e: raise ValueError( f"The type of model supplied ({model_name_or_path} , " @@ -693,7 +695,7 @@ def forward( segment_ids: Optional[torch.Tensor], output_hidden_states: Optional[bool] = None, output_attentions: Optional[bool] = None, - return_dict: bool = True + return_dict: bool = True, ): """ Perform the forward pass of the DPR encoder model. From 2d906093b2e2f0a7ab3cc95faf0003283c7ef680 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 7 Jul 2022 14:51:53 +0200 Subject: [PATCH 57/89] capitalize model type --- haystack/modeling/model/language_model.py | 32 +++++++++++++++-------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 78b5abd880..ffd3d1e5a5 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -18,7 +18,7 @@ Thanks for the great work! 
""" -from typing import Type, Optional, Dict, Any, Union, List +from typing import Type, Tuple, Optional, Dict, Any, Union, List import re import json @@ -574,17 +574,18 @@ def _init_model_haystack_style( model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) try: - language_model_class = HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE[original_model_config.model_type.lower()] + original_model_type, language_model_class = capitalize_and_get_class(original_model_config.model_type.lower()) except KeyError as e: raise ValueError( f"The type of model supplied ({model_name_or_path} , " - f"({original_model_config.model_type}) is not supported by Haystack. " + f"({original_model_type}) is not supported by Haystack. " f"Supported model categories are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" ) # Instantiate the class for this model self.model.base_model.bert_model = language_model_class( pretrained_model_name_or_path=model_name_or_path, - model_type=HUGGINGFACE_CAPITALIZE.get(original_model_config.model_type.lower()), + model_type=original_model_type, + use_auth_token=use_auth_token, **model_kwargs, ).model @@ -757,11 +758,23 @@ def forward( "XLMRoberta": HFLanguageModel, "XLNet": HFLanguageModelWithPooler, } -#: Case insensitive version -HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE = {k.lower(): v for k, v in HUGGINGFACE_TO_HAYSTACK.items()} #: HF Capitalization pairs HUGGINGFACE_CAPITALIZE = {k.lower(): k for k in HUGGINGFACE_TO_HAYSTACK.keys()} + +def capitalize_and_get_class(model_type: str) -> Tuple[str, Type[LanguageModel]]: + """ + Returns the proper capitalized model type and the corresponding Haystack LanguageModel subclass + """ + model_type_capitalized, lm_class = None, None + + model_type_capitalized = HUGGINGFACE_CAPITALIZE.get(model_type.lower()) + if model_type_capitalized: + lm_class = HUGGINGFACE_TO_HAYSTACK.get(model_type_capitalized) + + return model_type_capitalized, lm_class + + #: Regex to match variants of the HF class name, to enhance our mode type guessing abilities. NAME_HINTS: Dict[str, str] = { "xlm.*roberta": "XLMRoberta", @@ -853,7 +866,7 @@ def get_language_model( # Find the class corresponding to this model type try: - language_model_class: Type[Union[HFLanguageModel, DPREncoder]] = HUGGINGFACE_TO_HAYSTACK[model_type] + model_type, language_model_class = capitalize_and_get_class(model_type) except KeyError as e: raise ValueError( f"The type of model supplied ({model_type}) is not supported by Haystack or was not correctly identified. 
" @@ -894,10 +907,7 @@ def _get_model_type( revision=revision, **(autoconfig_kwargs or {}), ) - - # Find if this mode is present in HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE.keys() - if config.model_type.lower() in HUGGINGFACE_TO_HAYSTACK_CASE_INSENSITIVE.keys(): - model_type = config.model_type + model_type = config.model_type except Exception as e: logger.error(f"AutoConfig failed to load on '{model_name_or_path}': {str(e)}") From 7df6778657eac5a5a4342e10eeff0f5da1c4ce02 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 7 Jul 2022 12:57:14 +0000 Subject: [PATCH 58/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 795a459983..7a22309e56 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -574,7 +574,9 @@ def _init_model_haystack_style( model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) try: - original_model_type, language_model_class = capitalize_and_get_class(original_model_config.model_type.lower()) + original_model_type, language_model_class = capitalize_and_get_class( + original_model_config.model_type.lower() + ) except KeyError as e: raise ValueError( f"The type of model supplied ({model_name_or_path} , " From b5f5b408e383e325fc79754117d6fb11d92c1a92 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 7 Jul 2022 15:03:06 +0200 Subject: [PATCH 59/89] mypy --- haystack/modeling/model/language_model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 795a459983..3858c13f74 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -573,8 +573,9 @@ def _init_model_haystack_style( self.model = self._init_model_through_config( model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) + original_model_type = original_model_config.model_type try: - original_model_type, language_model_class = capitalize_and_get_class(original_model_config.model_type.lower()) + original_model_type, language_model_class = capitalize_and_get_class(original_model_type.lower()) except KeyError as e: raise ValueError( f"The type of model supplied ({model_name_or_path} , " @@ -762,7 +763,7 @@ def forward( HUGGINGFACE_CAPITALIZE = {k.lower(): k for k in HUGGINGFACE_TO_HAYSTACK.keys()} -def capitalize_and_get_class(model_type: str) -> Tuple[str, Type[LanguageModel]]: +def capitalize_and_get_class(model_type: str) -> Tuple[str, Type[Union[HFLanguageModel, DPREncoder]]]: """ Returns the proper capitalized model type and the corresponding Haystack LanguageModel subclass """ From 8b56255dcd92b2f30fa17218dd313c4aeeb8b64d Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 7 Jul 2022 16:40:22 +0200 Subject: [PATCH 60/89] mypy --- haystack/modeling/model/language_model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 3858c13f74..53dbec9f12 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -763,9 +763,11 @@ def forward( HUGGINGFACE_CAPITALIZE = {k.lower(): k for k in HUGGINGFACE_TO_HAYSTACK.keys()} -def 
capitalize_and_get_class(model_type: str) -> Tuple[str, Type[Union[HFLanguageModel, DPREncoder]]]: +def capitalize_and_get_class(model_type: str) -> Tuple[Optional[str], Optional[Type[Union[HFLanguageModel, DPREncoder]]]]: """ - Returns the proper capitalized model type and the corresponding Haystack LanguageModel subclass + Returns the proper capitalized model type and the corresponding Haystack LanguageModel subclass. + :param model_type: the model_type as found in the config file + :return: the capitalized version of the model type, if found, and the wrapper class, if found. """ model_type_capitalized, lm_class = None, None From cbb644ae5e5d9aae159ddf40d0b8942cb2f8a865 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 7 Jul 2022 14:43:18 +0000 Subject: [PATCH 61/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 53dbec9f12..f9e0b49ae3 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -763,7 +763,9 @@ def forward( HUGGINGFACE_CAPITALIZE = {k.lower(): k for k in HUGGINGFACE_TO_HAYSTACK.keys()} -def capitalize_and_get_class(model_type: str) -> Tuple[Optional[str], Optional[Type[Union[HFLanguageModel, DPREncoder]]]]: +def capitalize_and_get_class( + model_type: str, +) -> Tuple[Optional[str], Optional[Type[Union[HFLanguageModel, DPREncoder]]]]: """ Returns the proper capitalized model type and the corresponding Haystack LanguageModel subclass. :param model_type: the model_type as found in the config file From f4a37bfdab70ac59e3b61a027309f86193ed07dc Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 7 Jul 2022 16:59:00 +0200 Subject: [PATCH 62/89] mypy --- haystack/modeling/model/language_model.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 53dbec9f12..e144e6ea35 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -565,7 +565,7 @@ def _init_model_haystack_style( original_model_config = AutoConfig.from_pretrained(haystack_lm_config) haystack_lm_model = Path(model_name_or_path) / "language_model.bin" - if "dpr" in original_model_config.model_type.lower(): + if original_model_config.model_type and "dpr" in original_model_config.model_type.lower(): dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) self.model = model_class.from_pretrained(haystack_lm_model, config=dpr_config, **model_kwargs) @@ -868,13 +868,12 @@ def get_language_model( ) # Find the class corresponding to this model type - try: - model_type, language_model_class = capitalize_and_get_class(model_type) - except KeyError as e: + model_type, language_model_class = capitalize_and_get_class(model_type) + if not language_model_class: raise ValueError( f"The type of model supplied ({model_type}) is not supported by Haystack or was not correctly identified. 
" f"Supported model types are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" - ) from e + ) # Instantiate the class for this model language_model = language_model_class( From 38baeb755eba5ebd9aeb518eea74150080b160a8 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 7 Jul 2022 17:11:33 +0200 Subject: [PATCH 63/89] mypy --- haystack/modeling/model/language_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index f708b85f13..1f0398932a 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -565,7 +565,8 @@ def _init_model_haystack_style( original_model_config = AutoConfig.from_pretrained(haystack_lm_config) haystack_lm_model = Path(model_name_or_path) / "language_model.bin" - if original_model_config.model_type and "dpr" in original_model_config.model_type.lower(): + original_model_type = original_model_config.model_type + if original_model_type and "dpr" in original_model_type.lower(): dpr_config = transformers.DPRConfig.from_pretrained(haystack_lm_config) self.model = model_class.from_pretrained(haystack_lm_model, config=dpr_config, **model_kwargs) @@ -573,7 +574,6 @@ def _init_model_haystack_style( self.model = self._init_model_through_config( model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) - original_model_type = original_model_config.model_type try: original_model_type, language_model_class = capitalize_and_get_class(original_model_type.lower()) except KeyError as e: From 1a458cf2b67ac310dd720474bbfb1fe42383e921 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 7 Jul 2022 17:15:52 +0200 Subject: [PATCH 64/89] mypy --- haystack/modeling/model/language_model.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 1f0398932a..2b29c3334e 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -574,9 +574,8 @@ def _init_model_haystack_style( self.model = self._init_model_through_config( model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) - try: - original_model_type, language_model_class = capitalize_and_get_class(original_model_type.lower()) - except KeyError as e: + original_model_type, language_model_class = capitalize_and_get_class(original_model_type.lower()) + if not language_model_class: raise ValueError( f"The type of model supplied ({model_name_or_path} , " f"({original_model_type}) is not supported by Haystack. 
" From 1d47c90e74bbb066cd8dde71f454c32e87657707 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 7 Jul 2022 20:55:10 +0200 Subject: [PATCH 65/89] fix tests --- haystack/modeling/model/language_model.py | 47 ++++++++++++-------- haystack/modeling/model/triadaptive_model.py | 15 +++++-- 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 2b29c3334e..7c4a6c8fec 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -759,7 +759,11 @@ def forward( "XLNet": HFLanguageModelWithPooler, } #: HF Capitalization pairs -HUGGINGFACE_CAPITALIZE = {k.lower(): k for k in HUGGINGFACE_TO_HAYSTACK.keys()} +HUGGINGFACE_CAPITALIZE = { + "xlm-roberta": "XLMRoberta", + "deberta-v2": "DebertaV2", + **{k.lower(): k for k in HUGGINGFACE_TO_HAYSTACK.keys()} +} def capitalize_and_get_class( @@ -770,13 +774,13 @@ def capitalize_and_get_class( :param model_type: the model_type as found in the config file :return: the capitalized version of the model type, if found, and the wrapper class, if found. """ - model_type_capitalized, lm_class = None, None + lm_class = None - model_type_capitalized = HUGGINGFACE_CAPITALIZE.get(model_type.lower()) - if model_type_capitalized: - lm_class = HUGGINGFACE_TO_HAYSTACK.get(model_type_capitalized) + model_type = HUGGINGFACE_CAPITALIZE.get(model_type.lower(), model_type) + if model_type: + lm_class = HUGGINGFACE_TO_HAYSTACK.get(model_type) - return model_type_capitalized, lm_class + return model_type, lm_class #: Regex to match variants of the HF class name, to enhance our mode type guessing abilities. @@ -848,6 +852,13 @@ def get_language_model( # it's a local directory in Haystack format config = json.load(open(config_file)) model_type = config["name"] + if not model_type: + model_type = _get_model_type( + pretrained_model_name_or_path, + use_auth_token=use_auth_token, + revision=revision, + autoconfig_kwargs=autoconfig_kwargs, + ) else: # It's from the model hub @@ -858,15 +869,17 @@ def get_language_model( revision=revision, autoconfig_kwargs=autoconfig_kwargs, ) - if not model_type: - raise ModelingError( - f"Model type not understood for '{pretrained_model_name_or_path}' " - f"({model_type if model_type else 'model_type not set'}). " - "Either supply the local path for a saved model, " - "or the name of one of a model that can be downloaded from the Model Hub. " - "Ensure that the model class name can be inferred from the directory name " - "when loading a Transformers model." - ) + + if not model_type: + logger.error( + f"Model type not understood for '{pretrained_model_name_or_path}' " + f"({model_type if model_type else 'model_type not set'}). " + "Either supply the local path for a saved model, " + "or the name of a model that can be downloaded from the Model Hub. " + "Ensure that the model class name can be inferred from the directory name " + "when loading a Transformers model.") + logger.error(f"Using the AutoModel class for '{pretrained_model_name_or_path}'. This can cause crashes!") + model_type = "Auto" # Find the class corresponding to this model type model_type, language_model_class = capitalize_and_get_class(model_type) @@ -929,10 +942,6 @@ def _get_model_type( f"MLM part of codebert is currently not supported in Haystack: '{model_name_or_path}' may crash later." ) - if not model_type: - logger.error("Model type not found. Using the AutoModel class. 
This can cause crashes later!") - model_type = "Auto" - return model_type diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index 66bee19507..8d381407b9 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -350,10 +350,17 @@ def forward_lm(self, **kwargs): pooled_output[1] = pooled_output_combined # Current batch consists of only texts else: - passage_params = { - key.replace("passage_", ""): value for key, value in kwargs.items() if key.startswith("passage_") - } - pooled_output2, hidden_states2 = self.language_model2(**passage_params) + # Make input two-dimensional + max_seq_len = kwargs["passage_input_ids"].shape[-1] + input_ids = kwargs["passage_input_ids"].view(-1, max_seq_len) + attention_mask = kwargs["passage_attention_mask"].view(-1, max_seq_len) + segment_ids = kwargs["passage_segment_ids"].view(-1, max_seq_len) + + pooled_output2, hidden_states2 = self.language_model2( + input_ids=input_ids, + attention_mask=attention_mask, + segment_ids=segment_ids + ) pooled_output[1] = pooled_output2 return tuple(pooled_output) From cacc11d638ff80711483731f928c36815946fd58 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 7 Jul 2022 18:59:58 +0000 Subject: [PATCH 66/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 5 +++-- haystack/modeling/model/triadaptive_model.py | 4 +--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 7c4a6c8fec..315e9fa9d5 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -762,7 +762,7 @@ def forward( HUGGINGFACE_CAPITALIZE = { "xlm-roberta": "XLMRoberta", "deberta-v2": "DebertaV2", - **{k.lower(): k for k in HUGGINGFACE_TO_HAYSTACK.keys()} + **{k.lower(): k for k in HUGGINGFACE_TO_HAYSTACK.keys()}, } @@ -877,7 +877,8 @@ def get_language_model( "Either supply the local path for a saved model, " "or the name of a model that can be downloaded from the Model Hub. " "Ensure that the model class name can be inferred from the directory name " - "when loading a Transformers model.") + "when loading a Transformers model." + ) logger.error(f"Using the AutoModel class for '{pretrained_model_name_or_path}'. 
This can cause crashes!") model_type = "Auto" diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index 8d381407b9..f3c52fd7a9 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -357,9 +357,7 @@ def forward_lm(self, **kwargs): segment_ids = kwargs["passage_segment_ids"].view(-1, max_seq_len) pooled_output2, hidden_states2 = self.language_model2( - input_ids=input_ids, - attention_mask=attention_mask, - segment_ids=segment_ids + input_ids=input_ids, attention_mask=attention_mask, segment_ids=segment_ids ) pooled_output[1] = pooled_output2 From e4c08baf7d8993f3d1a581b7afcdc7e4fc52c196 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Fri, 8 Jul 2022 09:17:57 +0200 Subject: [PATCH 67/89] typing --- haystack/modeling/model/language_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 7c4a6c8fec..51b3c9e968 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -907,7 +907,7 @@ def _get_model_type( use_auth_token: Optional[Union[str, bool]] = None, revision: Optional[str] = None, autoconfig_kwargs: Optional[Dict[str, Any]] = None, -) -> str: +) -> Optional[str]: """ Given a model name, try to use AutoConfig to understand which model type it is. In case it's not successful, tries to infer the type from the name of the model. From 888d3d61bb50e23366d56babc0a4311f2f4c7f9e Mon Sep 17 00:00:00 2001 From: ZanSara Date: Tue, 12 Jul 2022 10:55:47 +0200 Subject: [PATCH 68/89] Fix for triadaptive model --- haystack/modeling/model/triadaptive_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index f3c52fd7a9..229e5184c3 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -323,12 +323,12 @@ def forward_lm(self, **kwargs): table_input_ids = passage_input_ids[table_mask] table_segment_ids = table_segment_ids[table_mask] table_attention_mask = passage_attention_mask[table_mask] - pooled_output_tables, _ = self.language_model3(table_input_ids, table_segment_ids, table_attention_mask) + pooled_output_tables, _ = self.language_model3(input_ids=table_input_ids, segment_ids=table_segment_ids, attention_mask=table_attention_mask) text_input_ids = passage_input_ids[~table_mask] text_segment_ids = passage_segment_ids[~table_mask] text_attention_mask = passage_attention_mask[~table_mask] - pooled_output_text, _ = self.language_model2(text_input_ids, text_segment_ids, text_attention_mask) + pooled_output_text, _ = self.language_model2(input_ids=text_input_ids, segment_ids=text_segment_ids, attention_mask=text_attention_mask) last_table_idx = 0 last_text_idx = 0 From 176276133bbe084b210e9446aebee78cffb69fc8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 12 Jul 2022 09:01:06 +0000 Subject: [PATCH 69/89] Update Documentation & Code Style --- haystack/modeling/model/triadaptive_model.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index 229e5184c3..773172beaf 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -323,12 +323,16 
@@ def forward_lm(self, **kwargs): table_input_ids = passage_input_ids[table_mask] table_segment_ids = table_segment_ids[table_mask] table_attention_mask = passage_attention_mask[table_mask] - pooled_output_tables, _ = self.language_model3(input_ids=table_input_ids, segment_ids=table_segment_ids, attention_mask=table_attention_mask) + pooled_output_tables, _ = self.language_model3( + input_ids=table_input_ids, segment_ids=table_segment_ids, attention_mask=table_attention_mask + ) text_input_ids = passage_input_ids[~table_mask] text_segment_ids = passage_segment_ids[~table_mask] text_attention_mask = passage_attention_mask[~table_mask] - pooled_output_text, _ = self.language_model2(input_ids=text_input_ids, segment_ids=text_segment_ids, attention_mask=text_attention_mask) + pooled_output_text, _ = self.language_model2( + input_ids=text_input_ids, segment_ids=text_segment_ids, attention_mask=text_attention_mask + ) last_table_idx = 0 last_text_idx = 0 From d47338e3865fcd6fee761fd25c2aac010a1d9adf Mon Sep 17 00:00:00 2001 From: ZanSara Date: Tue, 12 Jul 2022 14:03:28 +0200 Subject: [PATCH 70/89] Remove comment --- haystack/modeling/model/language_model.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 10286d8af6..e1497a3ef5 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -719,22 +719,6 @@ def forward( pooled_output = output_tuple.pooler_output return pooled_output, None - # # If inheriting from HFLanguageModel - # output_tuple = super().forward( - # input_ids=input_ids, - # segment_ids=segment_ids, - # attention_mask=attention_mask, - # output_hidden_states=output_hidden_states, - # output_attentions=output_attentions, - # return_dict=return_dict - # ) - # if output_hidden_states or self.encoder.config.output_hidden_states: - # pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states - # return pooled_output, all_hidden_states - - # pooled_output = output_tuple.pooler_output - # return pooled_output, None - #: Match the name of the HuggingFace Model class to the corresponding Haystack wrapper HUGGINGFACE_TO_HAYSTACK: Dict[str, Union[Type[HFLanguageModel], Type[DPREncoder]]] = { From 5e6c83a65d6d02f76548a3b4fc1335b5a21a5ed0 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 13 Jul 2022 10:12:09 +0200 Subject: [PATCH 71/89] split capitalize_and_get_class --- haystack/modeling/model/language_model.py | 48 +++++++++++++---------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index e1497a3ef5..bba5699aac 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -574,7 +574,8 @@ def _init_model_haystack_style( self.model = self._init_model_through_config( model_config=original_model_config, model_class=model_class, model_kwargs=model_kwargs ) - original_model_type, language_model_class = capitalize_and_get_class(original_model_type.lower()) + original_model_type = capitalize_model_type(original_model_type) + language_model_class = get_language_model_class(original_model_type) if not language_model_class: raise ValueError( f"The type of model supplied ({model_name_or_path} , " @@ -749,24 +750,6 @@ def forward( **{k.lower(): k for k in HUGGINGFACE_TO_HAYSTACK.keys()}, } - -def capitalize_and_get_class( - model_type: str, -) -> Tuple[Optional[str], 
Optional[Type[Union[HFLanguageModel, DPREncoder]]]]: - """ - Returns the proper capitalized model type and the corresponding Haystack LanguageModel subclass. - :param model_type: the model_type as found in the config file - :return: the capitalized version of the model type, if found, and the wrapper class, if found. - """ - lm_class = None - - model_type = HUGGINGFACE_CAPITALIZE.get(model_type.lower(), model_type) - if model_type: - lm_class = HUGGINGFACE_TO_HAYSTACK.get(model_type) - - return model_type, lm_class - - #: Regex to match variants of the HF class name, to enhance our mode type guessing abilities. NAME_HINTS: Dict[str, str] = { "xlm.*roberta": "XLMRoberta", @@ -800,6 +783,30 @@ def capitalize_and_get_class( } +def capitalize_model_type( + model_type: str, +) -> Optional[str]: + """ + Returns the proper capitalized version of the model type, that can be used to + retrieve the model class from transformers. + :param model_type: the model_type as found in the config file + :return: the capitalized version of the model type, if found, or None. + """ + return HUGGINGFACE_CAPITALIZE.get(model_type.lower(), model_type) + + +def get_language_model_class( + model_type: Optional[str], +) -> Optional[Type[Union[HFLanguageModel, DPREncoder]]]: + """ + Returns the corresponding Haystack LanguageModel subclass. + :param model_type: the model_type , properly capitalized (see `capitalize_model_type()`) + :return: the wrapper class, or `None` if `model_type` was `None` or was not recognized. + Lower case model_type values will return `None` as well + """ + return HUGGINGFACE_TO_HAYSTACK.get(model_type) + + def get_language_model( pretrained_model_name_or_path: Union[Path, str], model_type: Optional[str] = None, @@ -867,7 +874,8 @@ def get_language_model( model_type = "Auto" # Find the class corresponding to this model type - model_type, language_model_class = capitalize_and_get_class(model_type) + model_type = capitalize_model_type(model_type) + language_model_class = get_language_model_class(model_type) if not language_model_class: raise ValueError( f"The type of model supplied ({model_type}) is not supported by Haystack or was not correctly identified. 
" From 81ae6acdeddfca3ec68ce912db0142249f5af1b8 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 13 Jul 2022 10:32:33 +0200 Subject: [PATCH 72/89] improve triadaptive_model.py --- haystack/modeling/model/triadaptive_model.py | 47 +++++++++++++++----- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index 773172beaf..d0688f178f 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -294,22 +294,30 @@ def forward_lm(self, **kwargs): pooled_output = [None, None] # Forward pass for the queries if "query_input_ids" in kwargs.keys(): - query_params = { - key.replace("query_", ""): value for key, value in kwargs.items() if key.startswith("query_") - } - pooled_output1, hidden_states1 = self.language_model1(**query_params) + pooled_output1 = self.language_model1( + input_ids=kwargs.get("query_input_ids"), + segment_ids=kwargs.get("query_segment_ids"), + attention_mask=kwargs.get("query_attention_mask"), + output_hidden_states=False, + output_attentions=False, + ) pooled_output[0] = pooled_output1 + # Forward pass for text passages and tables if "passage_input_ids" in kwargs.keys(): table_mask = torch.flatten(kwargs["is_table"]) == True + # Current batch consists of only tables if all(table_mask): - pooled_output2, hidden_states2 = self.language_model3( + pooled_output2 = self.language_model3( passage_input_ids=kwargs["passage_input_ids"], passage_segment_ids=kwargs["table_segment_ids"], passage_attention_mask=kwargs["passage_attention_mask"], + output_hidden_states=False, + output_attentions=False, ) pooled_output[1] = pooled_output2 + # Current batch consists of tables and texts elif any(table_mask): @@ -323,21 +331,31 @@ def forward_lm(self, **kwargs): table_input_ids = passage_input_ids[table_mask] table_segment_ids = table_segment_ids[table_mask] table_attention_mask = passage_attention_mask[table_mask] - pooled_output_tables, _ = self.language_model3( - input_ids=table_input_ids, segment_ids=table_segment_ids, attention_mask=table_attention_mask + + pooled_output_tables, = self.language_model3( + input_ids=table_input_ids, + segment_ids=table_segment_ids, + attention_mask=table_attention_mask, + output_hidden_states=False, + output_attentions=False, ) text_input_ids = passage_input_ids[~table_mask] text_segment_ids = passage_segment_ids[~table_mask] text_attention_mask = passage_attention_mask[~table_mask] - pooled_output_text, _ = self.language_model2( - input_ids=text_input_ids, segment_ids=text_segment_ids, attention_mask=text_attention_mask + + pooled_output_text = self.language_model2( + input_ids=text_input_ids, + segment_ids=text_segment_ids, + attention_mask=text_attention_mask, + output_hidden_states=False, + output_attentions=False, ) last_table_idx = 0 last_text_idx = 0 combined_outputs = [] - for idx, mask in enumerate(table_mask): + for mask in table_mask: if mask: combined_outputs.append(pooled_output_tables[last_table_idx]) last_table_idx += 1 @@ -352,6 +370,7 @@ def forward_lm(self, **kwargs): ), "Passage embedding model and table embedding model use different embedding sizes" pooled_output_combined = combined_outputs.view(-1, embedding_size) pooled_output[1] = pooled_output_combined + # Current batch consists of only texts else: # Make input two-dimensional @@ -360,8 +379,12 @@ def forward_lm(self, **kwargs): attention_mask = kwargs["passage_attention_mask"].view(-1, max_seq_len) segment_ids = 
kwargs["passage_segment_ids"].view(-1, max_seq_len) - pooled_output2, hidden_states2 = self.language_model2( - input_ids=input_ids, attention_mask=attention_mask, segment_ids=segment_ids + pooled_output2 = self.language_model2( + input_ids=input_ids, + attention_mask=attention_mask, + segment_ids=segment_ids, + output_hidden_states=False, + output_attentions=False, ) pooled_output[1] = pooled_output2 From 7b464e65fe32df47457c7f3d6461faeed0ca7dda Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 13 Jul 2022 10:35:59 +0200 Subject: [PATCH 73/89] simplifying more **kwargs --- haystack/modeling/training/base.py | 19 ++++++++----------- haystack/nodes/retriever/dense.py | 21 +++++++++++---------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/haystack/modeling/training/base.py b/haystack/modeling/training/base.py index c448cae3c1..94ec4dd181 100644 --- a/haystack/modeling/training/base.py +++ b/haystack/modeling/training/base.py @@ -251,7 +251,7 @@ def train(self): vocab_size2=len(self.data_silo.processor.passage_tokenizer), ) elif ( - not self.model.language_model.name == "debertav2" + self.model.language_model.name != "debertav2" ): # DebertaV2 has mismatched vocab size on purpose (see https://github.com/huggingface/transformers/issues/12428) self.model.verify_vocab_size(vocab_size=len(self.data_silo.processor.tokenizer)) self.model.train() @@ -767,16 +767,13 @@ def compute_loss(self, batch: dict, step: int) -> torch.Tensor: keys = [key for key in keys if key.startswith("teacher_output")] teacher_logits = [batch.pop(key) for key in keys] - params = { - "input_ids": batch["input_ids"], - "segment_ids": batch["segment_ids"], - "padding_mask": batch["padding_mask"], - } - if "output_hidden_states" in batch.keys(): - params["output_hidden_states"] = batch["output_hidden_states"] - if "output_attentions" in batch.keys(): - params["output_attentions"] = batch["output_attentions"] - logits = self.model.forward(**params) + logits = self.model.forward( + input_ids=batch.get("input_ids"), + segment_ids=batch.get("segment_ids"), + padding_mask=batch.get("padding_mask"), + output_hidden_states=batch.get("output_hidden_states"), + output_attentions=batch.get("output_attentions"), + ) student_loss = self.model.logits_to_loss(logits=logits, global_step=self.global_step, **batch) distillation_loss = self.distillation_loss_fn( diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index c1c4acf349..20ed4d3bb7 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -17,7 +17,8 @@ AutoConfig, DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast, - PreTrainedTokenizer, + DPRContextEncoderTokenizer, + DPRQuestionEncoderTokenizer, ) from haystack.errors import HaystackError @@ -66,7 +67,7 @@ def __init__( progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, - scale_score: bool = True, + scale_score: bool = True ): """ Init the Retriever incl. the two encoder models from a local or remote model checkpoint. @@ -777,6 +778,7 @@ def __init__( devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, + use_fast: bool = True ): """ Init the Retriever incl. the two encoder models from a local or remote model checkpoint. @@ -818,6 +820,7 @@ def __init__( :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]). 
If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + :param use_fast: Whether to use the fast version of DPR tokenizers or fallback to the standard version. Defaults to True. """ super().__init__() @@ -849,14 +852,12 @@ def __init__( "This can be set when initializing the DocumentStore" ) - tokenizers_default_classes: Dict[str, Type[PreTrainedTokenizer]] = { - "query": DPRQuestionEncoderTokenizerFast, - "passage": DPRContextEncoderTokenizerFast, - "table": DPRContextEncoderTokenizerFast, - } + query_tokenizer_class = DPRQuestionEncoderTokenizerFast if use_fast else DPRQuestionEncoderTokenizer + passage_tokenizer_class = DPRContextEncoderTokenizerFast if use_fast else DPRContextEncoderTokenizer + table_tokenizer_class = DPRContextEncoderTokenizerFast if use_fast else DPRContextEncoderTokenizer # Init & Load Encoders - self.query_tokenizer = tokenizers_default_classes["query"].from_pretrained( + self.query_tokenizer = query_tokenizer_class.from_pretrained( query_embedding_model, revision=model_version, do_lower_case=True, @@ -869,7 +870,7 @@ def __init__( revision=model_version, use_auth_token=use_auth_token, ) - self.passage_tokenizer = tokenizers_default_classes["passage"].from_pretrained( + self.passage_tokenizer = passage_tokenizer_class.from_pretrained( passage_embedding_model, revision=model_version, do_lower_case=True, @@ -882,7 +883,7 @@ def __init__( revision=model_version, use_auth_token=use_auth_token, ) - self.table_tokenizer = tokenizers_default_classes["table"].from_pretrained( + self.table_tokenizer = table_tokenizer_class.from_pretrained( table_embedding_model, revision=model_version, do_lower_case=True, From 8bdb42b20776058de8c1982e67f4997a296dc66c Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 13 Jul 2022 10:37:39 +0200 Subject: [PATCH 74/89] more **kwargs gone --- haystack/modeling/training/base.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/haystack/modeling/training/base.py b/haystack/modeling/training/base.py index 94ec4dd181..e1c155127b 100644 --- a/haystack/modeling/training/base.py +++ b/haystack/modeling/training/base.py @@ -906,16 +906,13 @@ def __init__( self.loss = DataParallel(self.loss).to(device) def compute_loss(self, batch: dict, step: int) -> torch.Tensor: - params = { - "input_ids": batch["input_ids"], - "segment_ids": batch["segment_ids"], - "padding_mask": batch["padding_mask"], - } - if "output_hidden_states" in batch.keys(): - params["output_hidden_states"] = batch["output_hidden_states"] - if "output_attentions" in batch.keys(): - params["output_attentions"] = batch["output_attentions"] - return self.backward_propagate(torch.sum(self.loss(**params)), step) + return self.backward_propagate(torch.sum(self.loss( + input_ids=batch.get("input_ids"), + segment_ids=batch.get("segment_ids"), + padding_mask=batch.get("padding_mask"), + output_hidden_states=batch.get("output_hidden_states"), + output_attentions=batch.get("output_attentions"), + )), step) class DistillationLoss(Module): From c9095992b35205afcb9ae4389b416a7adaf74e3b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 13 Jul 2022 08:39:14 +0000 Subject: [PATCH 75/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 8 ++------ 
haystack/modeling/model/triadaptive_model.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index bba5699aac..5e5006617e 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -783,9 +783,7 @@ def forward( } -def capitalize_model_type( - model_type: str, -) -> Optional[str]: +def capitalize_model_type(model_type: str) -> Optional[str]: """ Returns the proper capitalized version of the model type, that can be used to retrieve the model class from transformers. @@ -795,9 +793,7 @@ def capitalize_model_type( return HUGGINGFACE_CAPITALIZE.get(model_type.lower(), model_type) -def get_language_model_class( - model_type: Optional[str], -) -> Optional[Type[Union[HFLanguageModel, DPREncoder]]]: +def get_language_model_class(model_type: Optional[str]) -> Optional[Type[Union[HFLanguageModel, DPREncoder]]]: """ Returns the corresponding Haystack LanguageModel subclass. :param model_type: the model_type , properly capitalized (see `capitalize_model_type()`) diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index d0688f178f..8ce2388de8 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -331,10 +331,10 @@ def forward_lm(self, **kwargs): table_input_ids = passage_input_ids[table_mask] table_segment_ids = table_segment_ids[table_mask] table_attention_mask = passage_attention_mask[table_mask] - - pooled_output_tables, = self.language_model3( - input_ids=table_input_ids, - segment_ids=table_segment_ids, + + (pooled_output_tables,) = self.language_model3( + input_ids=table_input_ids, + segment_ids=table_segment_ids, attention_mask=table_attention_mask, output_hidden_states=False, output_attentions=False, @@ -345,8 +345,8 @@ def forward_lm(self, **kwargs): text_attention_mask = passage_attention_mask[~table_mask] pooled_output_text = self.language_model2( - input_ids=text_input_ids, - segment_ids=text_segment_ids, + input_ids=text_input_ids, + segment_ids=text_segment_ids, attention_mask=text_attention_mask, output_hidden_states=False, output_attentions=False, @@ -370,7 +370,7 @@ def forward_lm(self, **kwargs): ), "Passage embedding model and table embedding model use different embedding sizes" pooled_output_combined = combined_outputs.view(-1, embedding_size) pooled_output[1] = pooled_output_combined - + # Current batch consists of only texts else: # Make input two-dimensional @@ -380,8 +380,8 @@ def forward_lm(self, **kwargs): segment_ids = kwargs["passage_segment_ids"].view(-1, max_seq_len) pooled_output2 = self.language_model2( - input_ids=input_ids, - attention_mask=attention_mask, + input_ids=input_ids, + attention_mask=attention_mask, segment_ids=segment_ids, output_hidden_states=False, output_attentions=False, From a1685bd8393b90084325080960468efc80819005 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 13 Jul 2022 10:52:05 +0200 Subject: [PATCH 76/89] mypy & pylint --- haystack/modeling/model/language_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index bba5699aac..2b66e2a6e7 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -18,7 +18,7 @@ Thanks for the great work! 
""" -from typing import Type, Tuple, Optional, Dict, Any, Union, List +from typing import Type, Optional, Dict, Any, Union, List import re import json @@ -790,13 +790,13 @@ def capitalize_model_type( Returns the proper capitalized version of the model type, that can be used to retrieve the model class from transformers. :param model_type: the model_type as found in the config file - :return: the capitalized version of the model type, if found, or None. + :return: the capitalized version of the model type, or the original name of not found. """ return HUGGINGFACE_CAPITALIZE.get(model_type.lower(), model_type) def get_language_model_class( - model_type: Optional[str], + model_type: str, ) -> Optional[Type[Union[HFLanguageModel, DPREncoder]]]: """ Returns the corresponding Haystack LanguageModel subclass. From 44c7726a20f866f4e4dfd0fd5d05283c4ae3d957 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 13 Jul 2022 08:55:41 +0000 Subject: [PATCH 77/89] Update Documentation & Code Style --- docs/_src/api/api/retriever.md | 3 ++- .../haystack-pipeline-master.schema.json | 5 +++++ haystack/modeling/model/language_model.py | 4 +--- haystack/modeling/training/base.py | 19 ++++++++++++------- haystack/nodes/retriever/dense.py | 4 ++-- 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index cd5f180388..715c2e2156 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -893,7 +893,7 @@ Kostić, Bogdan, et al. (2021): "Multi-modal Retrieval of Tables and Texts Using #### TableTextRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True) +def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, use_fast: bool = True) ``` Init the Retriever incl. the two encoder models from a local or remote model checkpoint. 
@@ -938,6 +938,7 @@ Additional information can be found here https://huggingface.co/transformers/mai - `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. +- `use_fast`: Whether to use the fast version of DPR tokenizers or fallback to the standard version. Defaults to True. diff --git a/haystack/json-schemas/haystack-pipeline-master.schema.json b/haystack/json-schemas/haystack-pipeline-master.schema.json index 5fede84149..70e0f94e26 100644 --- a/haystack/json-schemas/haystack-pipeline-master.schema.json +++ b/haystack/json-schemas/haystack-pipeline-master.schema.json @@ -4277,6 +4277,11 @@ "title": "Scale Score", "default": true, "type": "boolean" + }, + "use_fast": { + "title": "Use Fast", + "default": true, + "type": "boolean" } }, "required": [ diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 827155c859..4dba36cc6e 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -793,9 +793,7 @@ def capitalize_model_type(model_type: str) -> Optional[str]: return HUGGINGFACE_CAPITALIZE.get(model_type.lower(), model_type) -def get_language_model_class( - model_type: str, -) -> Optional[Type[Union[HFLanguageModel, DPREncoder]]]: +def get_language_model_class(model_type: str) -> Optional[Type[Union[HFLanguageModel, DPREncoder]]]: """ Returns the corresponding Haystack LanguageModel subclass. :param model_type: the model_type , properly capitalized (see `capitalize_model_type()`) diff --git a/haystack/modeling/training/base.py b/haystack/modeling/training/base.py index e1c155127b..2be10aaab6 100644 --- a/haystack/modeling/training/base.py +++ b/haystack/modeling/training/base.py @@ -906,13 +906,18 @@ def __init__( self.loss = DataParallel(self.loss).to(device) def compute_loss(self, batch: dict, step: int) -> torch.Tensor: - return self.backward_propagate(torch.sum(self.loss( - input_ids=batch.get("input_ids"), - segment_ids=batch.get("segment_ids"), - padding_mask=batch.get("padding_mask"), - output_hidden_states=batch.get("output_hidden_states"), - output_attentions=batch.get("output_attentions"), - )), step) + return self.backward_propagate( + torch.sum( + self.loss( + input_ids=batch.get("input_ids"), + segment_ids=batch.get("segment_ids"), + padding_mask=batch.get("padding_mask"), + output_hidden_states=batch.get("output_hidden_states"), + output_attentions=batch.get("output_attentions"), + ) + ), + step, + ) class DistillationLoss(Module): diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 20ed4d3bb7..2d14545e49 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -67,7 +67,7 @@ def __init__( progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, - scale_score: bool = True + scale_score: bool = True, ): """ Init the Retriever incl. the two encoder models from a local or remote model checkpoint. @@ -778,7 +778,7 @@ def __init__( devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, - use_fast: bool = True + use_fast: bool = True, ): """ Init the Retriever incl. 
the two encoder models from a local or remote model checkpoint. From d5eb60623fe3d214e9917fcb99c13ebf57cd9a4a Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 13 Jul 2022 11:06:55 +0200 Subject: [PATCH 78/89] mypy & pylint again --- haystack/modeling/model/language_model.py | 2 +- haystack/nodes/retriever/dense.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 827155c859..ddc287bfd5 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -783,7 +783,7 @@ def forward( } -def capitalize_model_type(model_type: str) -> Optional[str]: +def capitalize_model_type(model_type: str) -> str: """ Returns the proper capitalized version of the model type, that can be used to retrieve the model class from transformers. diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 20ed4d3bb7..0d3d3c2e46 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Union, Optional, Any, Type +from typing import List, Dict, Union, Optional, Any import logging from pathlib import Path From 0bb11042ce74df971ecbd189389df6edcf30c9e6 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 13 Jul 2022 15:11:54 +0200 Subject: [PATCH 79/89] Improve management of output_hidden_states --- haystack/modeling/model/language_model.py | 28 ++++++++++++++--------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 653a196021..29bb84ed44 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -343,8 +343,10 @@ def forward( :return: Embeddings for each token in the input sequence. Can also return hidden states and attentions if specified using the arguments `output_hidden_states` and `output_attentions`. """ if hasattr(self, "encoder"): # Not all models have an encoder - output_hidden_states = output_hidden_states or self.model.encoder.config.output_hidden_states - output_attentions = output_attentions or self.model.encoder.config.output_attentions + if output_hidden_states is None: + output_hidden_states: Optional[bool] = self.model.encoder.config.output_hidden_states + if output_attentions is None: + output_attentions: Optional[bool] = self.model.encoder.config.output_attentions params = {} if input_ids is not None: @@ -433,7 +435,6 @@ def forward( :param output_attentions: When set to `True`, outputs attentions in addition to the embeddings. :return: Embeddings for each token in the input sequence. """ - output_tuple = super().forward( input_ids=input_ids, segment_ids=segment_ids, @@ -707,18 +708,23 @@ def forward( :param attention_mask: A mask that assigns 1 to valid input tokens and 0 to padding tokens of shape [batch_size, number_of_hard_negative_passages, max_seq_len]. :param output_hidden_states: whether to add the hidden states along with the pooled output - :param output_attentions: unused for DPREncoder + :param output_attentions: unused :return: Embeddings for each token in the input sequence. 
""" - output_tuple = self.model( - input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask, return_dict=return_dict + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.encoder.config.output_hidden_states + + model_output = self.model( + input_ids=input_ids, + token_type_ids=segment_ids, + attention_mask=attention_mask, + output_hidden_states=output_hidden_states, + output_attentions=False, + return_dict=return_dict ) - if output_hidden_states or self.encoder.config.output_hidden_states: - pooled_output, all_hidden_states = output_tuple.pooler_output, output_tuple.hidden_states - return pooled_output, all_hidden_states - pooled_output = output_tuple.pooler_output - return pooled_output, None + if output_hidden_states: + return model_output.pooler_output, model_output.hidden_states + return model_output.pooler_output, None #: Match the name of the HuggingFace Model class to the corresponding Haystack wrapper From 34121d78e831dc839ac3a7b203bd4d18765d6b6f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 13 Jul 2022 13:15:32 +0000 Subject: [PATCH 80/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 29bb84ed44..fd51efae9b 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -711,15 +711,17 @@ def forward( :param output_attentions: unused :return: Embeddings for each token in the input sequence. """ - output_hidden_states = output_hidden_states if output_hidden_states is not None else self.encoder.config.output_hidden_states + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.encoder.config.output_hidden_states + ) model_output = self.model( - input_ids=input_ids, - token_type_ids=segment_ids, - attention_mask=attention_mask, - output_hidden_states=output_hidden_states, - output_attentions=False, - return_dict=return_dict + input_ids=input_ids, + token_type_ids=segment_ids, + attention_mask=attention_mask, + output_hidden_states=output_hidden_states, + output_attentions=False, + return_dict=return_dict, ) if output_hidden_states: From c5a6dd09b970907a79aa0c811fb0728ed6ba74ab Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 13 Jul 2022 15:26:36 +0200 Subject: [PATCH 81/89] mypy --- haystack/modeling/model/language_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 29bb84ed44..d6175c01af 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -344,9 +344,9 @@ def forward( """ if hasattr(self, "encoder"): # Not all models have an encoder if output_hidden_states is None: - output_hidden_states: Optional[bool] = self.model.encoder.config.output_hidden_states + output_hidden_states = self.model.encoder.config.output_hidden_states if output_attentions is None: - output_attentions: Optional[bool] = self.model.encoder.config.output_attentions + output_attentions = self.model.encoder.config.output_attentions params = {} if input_ids is not None: From 8df63f74ccd786b2fe9d91e05fdfe81e55ccbe55 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 13 Jul 2022 16:32:05 +0200 Subject: [PATCH 82/89] fix tests --- 
haystack/modeling/model/triadaptive_model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/haystack/modeling/model/triadaptive_model.py b/haystack/modeling/model/triadaptive_model.py index 8ce2388de8..9a76dab0d3 100644 --- a/haystack/modeling/model/triadaptive_model.py +++ b/haystack/modeling/model/triadaptive_model.py @@ -294,7 +294,7 @@ def forward_lm(self, **kwargs): pooled_output = [None, None] # Forward pass for the queries if "query_input_ids" in kwargs.keys(): - pooled_output1 = self.language_model1( + pooled_output1, _ = self.language_model1( input_ids=kwargs.get("query_input_ids"), segment_ids=kwargs.get("query_segment_ids"), attention_mask=kwargs.get("query_attention_mask"), @@ -309,7 +309,7 @@ def forward_lm(self, **kwargs): # Current batch consists of only tables if all(table_mask): - pooled_output2 = self.language_model3( + pooled_output2, _ = self.language_model3( passage_input_ids=kwargs["passage_input_ids"], passage_segment_ids=kwargs["table_segment_ids"], passage_attention_mask=kwargs["passage_attention_mask"], @@ -332,7 +332,7 @@ def forward_lm(self, **kwargs): table_segment_ids = table_segment_ids[table_mask] table_attention_mask = passage_attention_mask[table_mask] - (pooled_output_tables,) = self.language_model3( + pooled_output_tables, _ = self.language_model3( input_ids=table_input_ids, segment_ids=table_segment_ids, attention_mask=table_attention_mask, @@ -344,7 +344,7 @@ def forward_lm(self, **kwargs): text_segment_ids = passage_segment_ids[~table_mask] text_attention_mask = passage_attention_mask[~table_mask] - pooled_output_text = self.language_model2( + pooled_output_text, _ = self.language_model2( input_ids=text_input_ids, segment_ids=text_segment_ids, attention_mask=text_attention_mask, @@ -379,7 +379,7 @@ def forward_lm(self, **kwargs): attention_mask = kwargs["passage_attention_mask"].view(-1, max_seq_len) segment_ids = kwargs["passage_segment_ids"].view(-1, max_seq_len) - pooled_output2 = self.language_model2( + pooled_output2, _ = self.language_model2( input_ids=input_ids, attention_mask=attention_mask, segment_ids=segment_ids, From 2e9f12f97b00b49bf8ac16cd7d3b88654c86e5e0 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 14 Jul 2022 15:03:20 +0200 Subject: [PATCH 83/89] remove excess params from trainer --- haystack/modeling/training/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/haystack/modeling/training/base.py b/haystack/modeling/training/base.py index 2be10aaab6..d8224cda9c 100644 --- a/haystack/modeling/training/base.py +++ b/haystack/modeling/training/base.py @@ -911,9 +911,7 @@ def compute_loss(self, batch: dict, step: int) -> torch.Tensor: self.loss( input_ids=batch.get("input_ids"), segment_ids=batch.get("segment_ids"), - padding_mask=batch.get("padding_mask"), - output_hidden_states=batch.get("output_hidden_states"), - output_attentions=batch.get("output_attentions"), + padding_mask=batch.get("padding_mask") ) ), step, From 3a5b9ececa2956599afc3d70541218794f6d0562 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 14 Jul 2022 13:09:00 +0000 Subject: [PATCH 84/89] Update Documentation & Code Style --- haystack/modeling/training/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/modeling/training/base.py b/haystack/modeling/training/base.py index d8224cda9c..c8ab06ce79 100644 --- a/haystack/modeling/training/base.py +++ b/haystack/modeling/training/base.py @@ -911,7 +911,7 @@ def 
compute_loss(self, batch: dict, step: int) -> torch.Tensor: self.loss( input_ids=batch.get("input_ids"), segment_ids=batch.get("segment_ids"), - padding_mask=batch.get("padding_mask") + padding_mask=batch.get("padding_mask"), ) ), step, From e7ebad469aa17b7b69a2c7f893d1f0071adf4db2 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Wed, 20 Jul 2022 17:45:47 +0200 Subject: [PATCH 85/89] simplifying tokenizer tests --- haystack/modeling/model/language_model.py | 8 +- haystack/modeling/model/tokenization.py | 109 +-- test/modeling/test_tokenization.py | 772 +++++++++------------- 3 files changed, 372 insertions(+), 517 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index c49689e111..b477b2e503 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -37,6 +37,10 @@ from haystack.errors import ModelingError + +logger = logging.getLogger(__name__) + + LANGUAGE_HINTS = ( ("german", "german"), ("english", "english"), @@ -50,10 +54,6 @@ ("multilingual", "multilingual"), ) - -logger = logging.getLogger(__name__) - - #: Names of the attributes in various model configs which refer to the number of dimensions in the output vectors OUTPUT_DIM_NAMES = ["dim", "hidden_size", "d_model"] diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index 2b6ae16a52..6d258d6535 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -52,14 +52,14 @@ def get_tokenizer( model_name_or_path = str(pretrained_model_name_or_path) if "mlm" in model_name_or_path.lower(): - raise NotImplementedError("MLM part of codebert is currently not supported in Haystack") + logging.error("MLM part of codebert is currently not supported in Haystack. Proceed at your own risk.") params = {} if any(tokenizer_type in model_name_or_path for tokenizer_type in ["albert", "xlnet"]): params["keep_accents"] = True return AutoTokenizer.from_pretrained( - model_name_or_path, revision=revision, use_fast=use_fast, use_auth_token=use_auth_token, **params, **kwargs + pretrained_model_name_or_path=model_name_or_path, revision=revision, use_fast=use_fast, use_auth_token=use_auth_token, **params, **kwargs ) @@ -144,6 +144,62 @@ def _get_start_of_word_QA(word_ids): return [1] + list(np.ediff1d(np.array(word_ids))) +def truncate_sequences( + seq_a: list, + seq_b: Optional[list], + tokenizer: AutoTokenizer, + max_seq_len: int, + truncation_strategy: str = "longest_first", + with_special_tokens: bool = True, + stride: int = 0, +) -> Tuple[List[Any], Optional[List[Any]], List[Any]]: + """ + Reduces a single sequence or a pair of sequences to a maximum sequence length. + The sequences can contain tokens or any other elements (offsets, masks ...). + If `with_special_tokens` is enabled, it'll remove some additional tokens to have exactly + enough space for later adding special tokens (CLS, SEP etc.) + + Supported truncation strategies: + + - longest_first: (default) Iteratively reduce the inputs sequence until the input is under + max_length starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. + - only_first: Only truncate the first sequence. raise an error if the first sequence is + shorter or equal to than num_tokens_to_remove. 
+ - only_second: Only truncate the second sequence + - do_not_truncate: Does not truncate (raise an error if the input sequence is longer than max_length) + + :param seq_a: First sequence of tokens/offsets/... + :param seq_b: Optional second sequence of tokens/offsets/... + :param tokenizer: Tokenizer (e.g. from get_tokenizer)) + :param max_seq_len: + :param truncation_strategy: how the sequence(s) should be truncated down. + Default: "longest_first" (see above for other options). + :param with_special_tokens: If true, it'll remove some additional tokens to have exactly enough space + for later adding special tokens (CLS, SEP etc.) + :param stride: optional stride of the window during truncation + :return: truncated seq_a, truncated seq_b, overflowing tokens + """ + pair = seq_b is not None + len_a = len(seq_a) + len_b = len(seq_b) if seq_b is not None else 0 + num_special_tokens = tokenizer.num_special_tokens_to_add(pair=pair) if with_special_tokens else 0 + total_len = len_a + len_b + num_special_tokens + overflowing_tokens = [] + + if max_seq_len and total_len > max_seq_len: + seq_a, seq_b, overflowing_tokens = tokenizer.truncate_sequences( + seq_a, + pair_ids=seq_b, + num_tokens_to_remove=total_len - max_seq_len, + truncation_strategy=truncation_strategy, + stride=stride, + ) + return (seq_a, seq_b, overflowing_tokens) + +# +# FIXME this is a relic from FARM. If there's the occasion, remove it! +# def tokenize_with_metadata(text: str, tokenizer: PreTrainedTokenizer) -> Dict[str, Any]: """ Performing tokenization while storing some important metadata for each token: @@ -201,54 +257,7 @@ def tokenize_with_metadata(text: str, tokenizer: PreTrainedTokenizer) -> Dict[st return {"tokens": tokens, "offsets": offsets, "start_of_word": start_of_word} -def truncate_sequences( - seq_a: list, - seq_b: Optional[list], - tokenizer, - max_seq_len: int, - truncation_strategy: str = "longest_first", - with_special_tokens: bool = True, - stride: int = 0, -) -> Tuple[List[Any], Optional[List[Any]], List[Any]]: - """ - Reduces a single sequence or a pair of sequences to a maximum sequence length. - The sequences can contain tokens or any other elements (offsets, masks ...). - If `with_special_tokens` is enabled, it'll remove some additional tokens to have exactly enough space for later adding special tokens (CLS, SEP etc.) - - Supported truncation strategies: - - - longest_first: (default) Iteratively reduce the inputs sequence until the input is under max_length starting from the longest one at each token (when there is a pair of input sequences). Overflowing tokens only contains overflow from the first sequence. - - only_first: Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. - - only_second: Only truncate the second sequence - - do_not_truncate: Does not truncate (raise an error if the input sequence is longer than max_length) - - :param seq_a: First sequence of tokens/offsets/... - :param seq_b: Optional second sequence of tokens/offsets/... - :param tokenizer: Tokenizer (e.g. from get_tokenizer)) - :param max_seq_len: - :param truncation_strategy: how the sequence(s) should be truncated down. Default: "longest_first" (see above for other options). - :param with_special_tokens: If true, it'll remove some additional tokens to have exactly enough space for later adding special tokens (CLS, SEP etc.) 
- :param stride: optional stride of the window during truncation - :return: truncated seq_a, truncated seq_b, overflowing tokens - """ - pair = seq_b is not None - len_a = len(seq_a) - len_b = len(seq_b) if seq_b is not None else 0 - num_special_tokens = tokenizer.num_special_tokens_to_add(pair=pair) if with_special_tokens else 0 - total_len = len_a + len_b + num_special_tokens - overflowing_tokens = [] - - if max_seq_len and total_len > max_seq_len: - seq_a, seq_b, overflowing_tokens = tokenizer.truncate_sequences( - seq_a, - pair_ids=seq_b, - num_tokens_to_remove=total_len - max_seq_len, - truncation_strategy=truncation_strategy, - stride=stride, - ) - return (seq_a, seq_b, overflowing_tokens) - - +# Note: only used by tokenize_with_metadata() def _words_to_tokens( words: List[str], word_offsets: List[int], tokenizer: PreTrainedTokenizer ) -> Tuple[List[str], List[int], List[bool]]: diff --git a/test/modeling/test_tokenization.py b/test/modeling/test_tokenization.py index 865348ce7c..70a3dc8fcf 100644 --- a/test/modeling/test_tokenization.py +++ b/test/modeling/test_tokenization.py @@ -1,500 +1,346 @@ -import logging -import pytest +from typing import Tuple + import re -from transformers import ( - BertTokenizer, - BertTokenizerFast, - RobertaTokenizer, - RobertaTokenizerFast, - XLNetTokenizer, - XLNetTokenizerFast, - ElectraTokenizerFast, -) + +import pytest +import numpy as np +from unittest.mock import MagicMock from tokenizers.pre_tokenizers import WhitespaceSplit +import haystack from haystack.modeling.model.tokenization import get_tokenizer -import numpy as np +TOKENIZERS_TO_TEST = ["bert-base-cased", "roberta-base", "xlnet-base-cased"] +TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER = [("bert-base-cased", "##"), ("roberta-base", "Ġ"), ("xlnet-base-cased", "▁")] -TEXTS = [ - "This is a sentence", - "Der entscheidende Pass", - "This is a sentence with multiple spaces", - "力加勝北区ᴵᴺᵀᵃছজটডণত", - "Thiso text is included tolod makelio sure Unicodeel is handled properly:", - "This is a sentence...", - "Let's see all on this text and. !23# neverseenwordspossible", - """This is a sentence. - With linebreak""", - """Sentence with multiple + +REGULAR_SENTENCE = "This is a sentence" +GERMAN_SENTENCE = "Der entscheidende Pass" +OTHER_ALPHABETS = "力加勝北区ᴵᴺᵀᵃছজটডণত" +GIBBERISH_SENTENCE = "Thiso text is included tolod makelio sure Unicodeel is handled properly:" +SENTENCE_WITH_ELLIPSIS = "This is a sentence..." +SENTENCE_WITH_LINEBREAK_1 = "and another one\n\n\nwithout space" +SENTENCE_WITH_LINEBREAK_2 = """This is a sentence. 
+ With linebreak""" +SENTENCE_WITH_LINEBREAKS = """Sentence + with + multiple newlines - """, - "and another one\n\n\nwithout space", - "This is a sentence with tab", - "This is a sentence with multiple tabs", -] - - -def test_basic_loading(caplog): - caplog.set_level(logging.CRITICAL) - # slow tokenizers - tokenizer = get_tokenizer(pretrained_model_name_or_path="bert-base-cased", do_lower_case=True, use_fast=False) - assert type(tokenizer) == BertTokenizer - assert tokenizer.basic_tokenizer.do_lower_case == True - - tokenizer = get_tokenizer(pretrained_model_name_or_path="xlnet-base-cased", do_lower_case=True, use_fast=False) - assert type(tokenizer) == XLNetTokenizer - assert tokenizer.do_lower_case == True - - tokenizer = get_tokenizer(pretrained_model_name_or_path="roberta-base", use_fast=False) - assert type(tokenizer) == RobertaTokenizer - - # fast tokenizers - tokenizer = get_tokenizer(pretrained_model_name_or_path="bert-base-cased", do_lower_case=True) - assert type(tokenizer) == BertTokenizerFast - assert tokenizer.do_lower_case == True - - tokenizer = get_tokenizer(pretrained_model_name_or_path="xlnet-base-cased", do_lower_case=True) - assert type(tokenizer) == XLNetTokenizerFast - assert tokenizer.do_lower_case == True - - tokenizer = get_tokenizer(pretrained_model_name_or_path="roberta-base") - assert type(tokenizer) == RobertaTokenizerFast - - -def test_bert_tokenizer_all_meta(caplog): - caplog.set_level(logging.CRITICAL) - - lang_model = "bert-base-cased" - - tokenizer = get_tokenizer(pretrained_model_name_or_path=lang_model, do_lower_case=False) - - basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars" - - tokenized = tokenizer.tokenize(basic_text) - assert tokenized == [ - "Some", - "Text", - "with", - "never", - "##see", - "##nto", - "##ken", - "##s", - "plus", - "!", - "215", - "?", - "#", - ".", - "and", - "a", - "combined", - "-", - "token", - "_", - "with", - "/", - "ch", - "##ars", - ] + """ +SENTENCE_WITH_EXCESS_WHITESPACE = "This is a sentence with multiple spaces" +SENTENCE_WITH_TABS = "This is a sentence with multiple tabs" +SENTENCE_WITH_CUSTOM_TOKEN = "Let's see all on this text and. 
!23# neverseenwordspossible" - encoded_batch = tokenizer.encode_plus(basic_text) - encoded = encoded_batch.encodings[0] - words = np.array(encoded.words) - words[words == None] = -1 - start_of_word_single = [False] + list(np.ediff1d(words) > 0) - assert encoded.tokens == [ - "[CLS]", - "Some", - "Text", - "with", - "never", - "##see", - "##nto", - "##ken", - "##s", - "plus", - "!", - "215", - "?", - "#", - ".", - "and", - "a", - "combined", - "-", - "token", - "_", - "with", - "/", - "ch", - "##ars", - "[SEP]", - ] - assert [x[0] for x in encoded.offsets] == [ - 0, - 0, - 5, - 10, - 15, - 20, - 23, - 26, - 29, - 31, - 36, - 37, - 40, - 41, - 42, - 44, - 48, - 50, - 58, - 59, - 64, - 65, - 69, - 70, - 72, - 0, - ] - assert start_of_word_single == [ - False, - True, - True, - True, - True, - False, - False, - False, - False, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - False, - False, - ] +class AutoTokenizer: + mocker: MagicMock = MagicMock() -def test_save_load(tmp_path, caplog): - caplog.set_level(logging.CRITICAL) - - lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"] - tokenizers = [] - for lang_name in lang_names: - if "xlnet" in lang_name.lower(): - t = get_tokenizer(lang_name, lower_case=False, use_fast=True, from_slow=True) - else: - t = get_tokenizer(lang_name, lower_case=False) - t.add_tokens(new_tokens=["neverseentokens"]) - tokenizers.append(t) - - basic_text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars" - - for tokenizer in tokenizers: - tokenizer_type = tokenizer.__class__.__name__ - save_dir = f"{tmp_path}/testsave/{tokenizer_type}" - tokenizer.save_pretrained(save_dir) - tokenizer_loaded = get_tokenizer(save_dir, tokenizer_class=tokenizer_type) - encoded_before = tokenizer.encode_plus(basic_text).encodings[0] - encoded_after = tokenizer_loaded.encode_plus(basic_text).encodings[0] - data_before = { - "tokens": encoded_before.tokens, - "offsets": encoded_before.offsets, - "words": encoded_before.words, - } - data_after = {"tokens": encoded_after.tokens, "offsets": encoded_after.offsets, "words": encoded_after.words} - assert data_before == data_after - - -@pytest.mark.parametrize("model_name", ["bert-base-german-cased", "google/electra-small-discriminator"]) -def test_fast_tokenizer_with_examples(caplog, model_name): - fast_tokenizer = get_tokenizer(model_name, lower_case=False) - tokenizer = get_tokenizer(model_name, lower_case=False, use_fast=False) - - for text in TEXTS: - # plain tokenize function - tokenized = tokenizer.tokenize(text) - fast_tokenized = fast_tokenizer.tokenize(text) - - assert tokenized == fast_tokenized - - -def test_all_tokenizer_on_special_cases(caplog): - caplog.set_level(logging.CRITICAL) - - lang_names = ["bert-base-cased", "roberta-base", "xlnet-base-cased"] - - tokenizers = [] - for lang_name in lang_names: - if "roberta" in lang_name: - add_prefix_space = True - else: - add_prefix_space = False - t = get_tokenizer(lang_name, lower_case=False, add_prefix_space=add_prefix_space) - tokenizers.append(t) - - texts = [ - "This is a sentence", - "Der entscheidende Pass", - "力加勝北区ᴵᴺᵀᵃছজটডণত", - "Thiso text is included tolod makelio sure Unicodeel is handled properly:", - "This is a sentence...", - "Let's see all on this text and. !23# neverseenwordspossible" "This is a sentence with multiple spaces", - """This is a sentence. 
- With linebreak""", - """Sentence with multiple - newlines - """, - "and another one\n\n\nwithout space", - "This is a sentence with multiple tabs", - ] + @classmethod + def from_pretrained(cls, *args, **kwargs): + cls.mocker.from_pretrained(*args, **kwargs) + return cls() - expected_to_fail = {(2, 1), (2, 5)} - - for i_tok, tokenizer in enumerate(tokenizers): - for i_text, text in enumerate(texts): - # Important: we don't assume to preserve whitespaces after tokenization. - # This means: \t, \n " " etc will all resolve to a single " ". - # This doesn't make a difference for BERT + XLNet but it does for roBERTa - - test_passed = True - - # 1. original tokenize function from transformer repo on full sentence - standardized_whitespace_text = " ".join(text.split()) # remove multiple whitespaces - tokenized = tokenizer.tokenize(standardized_whitespace_text) - - # 2. Our tokenization method using a pretokenizer which can normalize multiple white spaces - # This approach is used in NER - pre_tokenizer = WhitespaceSplit() - words_and_spans = pre_tokenizer.pre_tokenize_str(text) - words = [x[0] for x in words_and_spans] - word_spans = [x[1] for x in words_and_spans] - - encoded = tokenizer.encode_plus(words, is_split_into_words=True, add_special_tokens=False).encodings[0] - - # verify that tokenization on full sequence is the same as the one on "whitespace tokenized words" - if encoded.tokens != tokenized: - test_passed = False - - # token offsets are originally relative to the beginning of the word - # These lines convert them so they are relative to the beginning of the sentence - token_offsets = [] - for ((start, end), w_index) in zip(encoded.offsets, encoded.words): - word_start_ch = word_spans[w_index][0] - token_offsets.append((start + word_start_ch, end + word_start_ch)) - - # verify that offsets align back to original text - if text == "力加勝北区ᴵᴺᵀᵃছজটডণত": - # contains [UNK] that are impossible to match back to original text space - continue - for tok, (start, end) in zip(encoded.tokens, token_offsets): - # subword-tokens have special chars depending on model type. In order to align with original text we need to get rid of them - tok = re.sub(r"^(##|Ġ|▁)", "", tok) - # tok = tokenizer.decode(tokenizer.convert_tokens_to_ids(tok)) - original_tok = text[start:end] - if tok != original_tok: - test_passed = False - if (i_tok, i_text) in expected_to_fail: - assert not test_passed, f"Behaviour of {tokenizer.__class__.__name__} has changed on text {text}'" - else: - assert test_passed, f"Behaviour of {tokenizer.__class__.__name__} has changed on text {text}'" - - -def test_bert_custom_vocab(caplog): - caplog.set_level(logging.CRITICAL) - - lang_model = "bert-base-cased" - - tokenizer = get_tokenizer(pretrained_model_name_or_path=lang_model, do_lower_case=False) - - # deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt") - tokenizer.add_tokens(new_tokens=["neverseentokens"]) - basic_text = "Some Text with neverseentokens plus !215?#. 
and a combined-token_with/chars" - - # original tokenizer from transformer repo - tokenized = tokenizer.tokenize(basic_text) - assert tokenized == [ - "Some", - "Text", - "with", - "neverseentokens", - "plus", - "!", - "215", - "?", - "#", - ".", - "and", - "a", - "combined", - "-", - "token", - "_", - "with", - "/", - "ch", - "##ars", - ] - # ours with metadata - encoded = tokenizer.encode_plus(basic_text, add_special_tokens=False).encodings[0] - offsets = [x[0] for x in encoded.offsets] - start_of_word_single = [True] + list(np.ediff1d(encoded.words) > 0) - assert encoded.tokens == tokenized - assert offsets == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72] - assert start_of_word_single == [ - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - False, - ] +@pytest.fixture(autouse=True) +def mock_autotokenizer(request, monkeypatch): + # Do not patch integration tests + if "integration" in request.keywords: + return + monkeypatch.setattr(haystack.modeling.model.tokenization, "AutoTokenizer", AutoTokenizer) -def test_fast_bert_custom_vocab(caplog): - caplog.set_level(logging.CRITICAL) - lang_model = "bert-base-cased" +def convert_offset_from_word_reference_to_text_reference(offsets, words, word_spans): + """ + Token offsets are originally relative to the beginning of the word + We make them relative to the beginning of the sentence. - tokenizer = get_tokenizer(pretrained_model_name_or_path=lang_model, do_lower_case=False) + Not a fixture, just a utility. + """ + token_offsets = [] + for ((start, end), word_index) in zip(offsets, words): + word_start = word_spans[word_index][0] + token_offsets.append((start + word_start, end + word_start)) + return token_offsets + - # deprecated: tokenizer.add_custom_vocab("samples/tokenizer/custom_vocab.txt") - tokenizer.add_tokens(new_tokens=["neverseentokens"]) - basic_text = "Some Text with neverseentokens plus !215?#. 
and a combined-token_with/chars" - - # original tokenizer from transformer repo - tokenized = tokenizer.tokenize(basic_text) - assert tokenized == [ - "Some", - "Text", - "with", - "neverseentokens", - "plus", - "!", - "215", - "?", - "#", - ".", - "and", - "a", - "combined", - "-", - "token", - "_", - "with", - "/", - "ch", - "##ars", - ] +# +# Unit tests +# - # ours with metadata - encoded = tokenizer.encode_plus(basic_text, add_special_tokens=False).encodings[0] - offsets = [x[0] for x in encoded.offsets] - start_of_word_single = [True] + list(np.ediff1d(encoded.words) > 0) - assert encoded.tokens == tokenized - assert offsets == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72] - assert start_of_word_single == [ - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - True, - False, - ] +def test_get_tokenizer_str(): + tokenizer = get_tokenizer(pretrained_model_name_or_path="test-model-name") + tokenizer.mocker.from_pretrained.assert_called_with( + pretrained_model_name_or_path='test-model-name', + revision=None, + use_fast=True, + use_auth_token=None + ) + + +def test_get_tokenizer_path(tmp_path): + tokenizer = get_tokenizer(pretrained_model_name_or_path=tmp_path / "test-path") + tokenizer.mocker.from_pretrained.assert_called_with( + pretrained_model_name_or_path=str(tmp_path / "test-path"), + revision=None, + use_fast=True, + use_auth_token=None + ) + + +def test_get_tokenizer_keep_accents(): + tokenizer = get_tokenizer(pretrained_model_name_or_path="test-model-name-albert") + tokenizer.mocker.from_pretrained.assert_called_with( + pretrained_model_name_or_path='test-model-name-albert', + revision=None, + use_fast=True, + use_auth_token=None, + keep_accents=True + ) + + +def test_get_tokenizer_mlm_warning(caplog): + tokenizer = get_tokenizer(pretrained_model_name_or_path="test-model-name-mlm") + tokenizer.mocker.from_pretrained.assert_called_with( + pretrained_model_name_or_path='test-model-name-mlm', + revision=None, + use_fast=True, + use_auth_token=None + ) + assert "MLM part of codebert is currently not supported in Haystack".lower() in caplog.text.lower() -@pytest.mark.parametrize( - "model_name, tokenizer_type", - [("bert-base-german-cased", BertTokenizerFast), ("google/electra-small-discriminator", ElectraTokenizerFast)], -) -def test_fast_tokenizer_type(caplog, model_name, tokenizer_type): - caplog.set_level(logging.CRITICAL) - tokenizer = get_tokenizer(model_name) - assert type(tokenizer) is tokenizer_type +# +# Integration tests +# +@pytest.mark.integration +@pytest.mark.parametrize("model_name", TOKENIZERS_TO_TEST) +def test_save_load(tmp_path, model_name: str): + tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False) + text = "Some Text with neverseentokens plus !215?#. 
and a combined-token_with/chars" -# See discussion in https://github.com/deepset-ai/FARM/pull/624 for reason to remove the test -# def test_fast_bert_tokenizer_strip_accents(caplog): -# caplog.set_level(logging.CRITICAL) -# -# tokenizer = get_tokenizer("dbmdz/bert-base-german-uncased", -# use_fast=True, -# strip_accents=False) -# assert type(tokenizer) is BertTokenizerFast -# assert tokenizer.do_lower_case -# assert tokenizer._tokenizer._parameters['strip_accents'] is False + tokenizer.add_tokens(new_tokens=["neverseentokens"]) + original_encoding = tokenizer.encode_plus(text) + save_dir = tmp_path / "saved_tokenizer" + tokenizer.save_pretrained(save_dir) -def test_fast_electra_tokenizer(caplog): - caplog.set_level(logging.CRITICAL) + tokenizer_loaded = get_tokenizer(pretrained_model_name_or_path=save_dir) + new_encoding = tokenizer_loaded.encode_plus(text) - tokenizer = get_tokenizer("dbmdz/electra-base-german-europeana-cased-discriminator") - assert type(tokenizer) is ElectraTokenizerFast + assert original_encoding == new_encoding -@pytest.mark.parametrize("model_name", ["bert-base-cased", "distilbert-base-uncased", "deepset/electra-base-squad2"]) -def test_detokenization_in_fast_tokenizers(model_name): - tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name) - for text in TEXTS: - encoded = tokenizer.encode_plus(text, add_special_tokens=False).encodings[0] +@pytest.mark.integration +def test_tokenize_custom_vocab_bert(): + tokenizer = get_tokenizer(pretrained_model_name_or_path="bert-base-cased", do_lower_case=False) + tokenizer.add_tokens(new_tokens=["neverseentokens"]) + text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars" + + tokenized = tokenizer.tokenize(text) + assert tokenized == f"Some Text with neverseentokens plus ! 215 ? # . and a combined - token _ with / ch ##ars".split() + + +@pytest.mark.integration +@pytest.mark.parametrize('edge_case', [ + REGULAR_SENTENCE, + OTHER_ALPHABETS, + GIBBERISH_SENTENCE, + SENTENCE_WITH_ELLIPSIS, + SENTENCE_WITH_LINEBREAK_1, + SENTENCE_WITH_LINEBREAK_2, + SENTENCE_WITH_LINEBREAKS, + SENTENCE_WITH_EXCESS_WHITESPACE, + SENTENCE_WITH_TABS +]) +@pytest.mark.parametrize("model_name", TOKENIZERS_TO_TEST) +def test_tokenization_on_edge_cases_full_sequence_tokenization(model_name: str, edge_case: str): + """ + Verify that tokenization on full sequence is the same as the one on "whitespace tokenized words" + """ + tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True) + + pre_tokenizer = WhitespaceSplit() + words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case) + words = [x[0] for x in words_and_spans] + + encoded = tokenizer.encode_plus(words, is_split_into_words=True, add_special_tokens=False).encodings[0] + expected_tokenization = tokenizer.tokenize(" ".join(edge_case.split())) # remove multiple whitespaces + + assert encoded.tokens == expected_tokenization + + +@pytest.mark.integration +@pytest.mark.parametrize('edge_case', [ + SENTENCE_WITH_CUSTOM_TOKEN, + GERMAN_SENTENCE, +]) +@pytest.mark.parametrize("model_name", [t for t in TOKENIZERS_TO_TEST if t != "roberta-base"]) +def test_tokenization_on_edge_cases_full_sequence_tokenization_roberta_exceptions(model_name: str, edge_case: str): + """ + Verify that tokenization on full sequence is the same as the one on "whitespace tokenized words". + These test cases work for all tokenizers under test except for RoBERTa. 
+ """ + tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True) + + pre_tokenizer = WhitespaceSplit() + words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case) + words = [x[0] for x in words_and_spans] + + encoded = tokenizer.encode_plus(words, is_split_into_words=True, add_special_tokens=False).encodings[0] + expected_tokenization = tokenizer.tokenize(" ".join(edge_case.split())) # remove multiple whitespaces + + assert encoded.tokens == expected_tokenization + + +@pytest.mark.integration +@pytest.mark.parametrize('edge_case', [ + REGULAR_SENTENCE, + # OTHER_ALPHABETS, # contains [UNK] that are impossible to match back to original text space + GIBBERISH_SENTENCE, + SENTENCE_WITH_ELLIPSIS, + SENTENCE_WITH_LINEBREAK_1, + SENTENCE_WITH_LINEBREAK_2, + SENTENCE_WITH_LINEBREAKS, + SENTENCE_WITH_EXCESS_WHITESPACE, + SENTENCE_WITH_TABS, +]) +@pytest.mark.parametrize("model_name,marker", TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER) +def test_tokenization_on_edge_cases_full_sequence_verify_spans(model_name: str, marker: str, edge_case: str): + tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True) + + pre_tokenizer = WhitespaceSplit() + words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case) + words = [x[0] for x in words_and_spans] + word_spans = [x[1] for x in words_and_spans] + + encoded = tokenizer.encode_plus(words, is_split_into_words=True, add_special_tokens=False).encodings[0] + + # subword-tokens have special chars depending on model type. To align with original text we get rid of them + tokens = [token.replace(marker, "") for token in encoded.tokens] + token_offsets = convert_offset_from_word_reference_to_text_reference(encoded.offsets, encoded.words, word_spans) + + for token, (start, end) in zip(tokens, token_offsets): + assert token == edge_case[start:end] + + +@pytest.mark.integration +@pytest.mark.parametrize('edge_case', [ + SENTENCE_WITH_CUSTOM_TOKEN, + GERMAN_SENTENCE, +]) +@pytest.mark.parametrize("model_name,marker", TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER) +def test_tokenization_on_edge_cases_full_sequence_verify_spans_roberta_exception(model_name: str, marker: str, edge_case: str): + tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True) + + pre_tokenizer = WhitespaceSplit() + words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case) + words = [x[0] for x in words_and_spans] + word_spans = [x[1] for x in words_and_spans] + + encoded = tokenizer.encode_plus(words, is_split_into_words=True, add_special_tokens=False).encodings[0] + + # subword-tokens have special chars depending on model type. 
To align with original text we get rid of them + tokens = [token.replace(marker, "") for token in encoded.tokens] + token_offsets = convert_offset_from_word_reference_to_text_reference(encoded.offsets, encoded.words, word_spans) + + for token, (start, end) in zip(tokens, token_offsets): + assert token == edge_case[start:end] + + +@pytest.mark.integration +@pytest.mark.parametrize('edge_case', [ + REGULAR_SENTENCE, + GERMAN_SENTENCE, + SENTENCE_WITH_EXCESS_WHITESPACE, + OTHER_ALPHABETS, + GIBBERISH_SENTENCE, + SENTENCE_WITH_ELLIPSIS, + SENTENCE_WITH_CUSTOM_TOKEN, + SENTENCE_WITH_LINEBREAK_1, + SENTENCE_WITH_LINEBREAK_2, + SENTENCE_WITH_LINEBREAKS, + SENTENCE_WITH_TABS, +]) +@pytest.mark.parametrize("model_name,marker", TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER) +def test_detokenization(model_name: str, marker: str, edge_case: str): + tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False) + + encoded = tokenizer.encode_plus(edge_case, add_special_tokens=False).encodings[0] + + detokenized = " ".join(encoded.tokens) + detokenized = detokenized.replace(marker, "") + + detokenized_ids = tokenizer(detokenized, add_special_tokens=False)["input_ids"] + detokenized_tokens = [tokenizer.decode([tok_id]).strip() for tok_id in detokenized_ids] + + assert encoded.tokens == detokenized_tokens + + +@pytest.mark.integration +def test_encode_plus_for_bert(): + tokenizer = get_tokenizer(pretrained_model_name_or_path="bert-base-cased", do_lower_case=False) + text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars" + + encoded_batch = tokenizer.encode_plus(text) + encoded = encoded_batch.encodings[0] + + words = np.array(encoded.words) + words[words is None] = -1 + + tokens = encoded.tokens + offsets = [x[0] for x in encoded.offsets] + start_of_word = [False] + list(np.ediff1d(words) > 0) + + assert zip(tokens, offsets, start_of_word) == [ + ("[CLS]", 0, False), + ("Some", 0, True), + ("Text", 5, True), + ("with", 10, True), + ("never", 15, True), + ("##see", 20, False), + ("##nto", 23, False), + ("##ken", 26, False), + ("##s", 29, False), + ("plus", 31, True), + ("!", 36, True), + ("215", 37, True), + ("?", 40, True), + ("#", 41, True), + (".", 42, True), + ("and", 44, True), + ("a", 48, True), + ("combined", 50, True), + ("-", 58, True), + ("token", 59, True), + ("_", 64, True), + ("with", 65, True), + ("/", 69, True), + ("ch", 70, True), + ("##ars", 72, False), + ("[SEP]", 0, False), + ] - detokenized = " ".join(encoded.tokens) - detokenized = re.sub(r"(^|\s+)(##)", "", detokenized) - detokenized_ids = tokenizer(detokenized, add_special_tokens=False)["input_ids"] - detokenized_tokens = [tokenizer.decode([tok_id]).strip() for tok_id in detokenized_ids] +@pytest.mark.integration +def test_tokenize_custom_vocab_bert(): + tokenizer = get_tokenizer(pretrained_model_name_or_path="bert-base-cased", do_lower_case=False) - assert encoded.tokens == detokenized_tokens + tokenizer.add_tokens(new_tokens=["neverseentokens"]) + text = "Some Text with neverseentokens plus !215?#. 
and a combined-token_with/chars" + tokenized = tokenizer.tokenize(text) -if __name__ == "__main__": - test_all_tokenizer_on_special_cases() + encoded = tokenizer.encode_plus(text, add_special_tokens=False).encodings[0] + offsets = [x[0] for x in encoded.offsets] + start_of_word_single = [True] + list(np.ediff1d(encoded.words) > 0) + + assert encoded.tokens == tokenized + assert offsets == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72] + assert start_of_word_single == [True]*19 + [False] From a5e2d9d9b97f86cdc5a6e91c82f103be88068f2f Mon Sep 17 00:00:00 2001 From: Sara Zan Date: Thu, 21 Jul 2022 11:20:50 +0200 Subject: [PATCH 86/89] Test `haystack/modeling/language_model.py` and remove `model_type` (#2862) * Add language unit tests * Parametrize language unit tests * Simplify model_type handling * Remove model_type parameter from get_language_model * Use the actual dpr encoders rather than tiny-bert in test_retriever.py * Additional checks for pretrained_model_name_or_path parameter in get_language_model * Fix get_language_model docs * Small fix --- haystack/modeling/model/biadaptive_model.py | 12 ++-- haystack/modeling/model/language_model.py | 66 +++++++++++---------- haystack/nodes/retriever/dense.py | 25 ++------ test/modeling/test_dpr.py | 16 ++--- test/modeling/test_language.py | 34 +++++++++++ test/nodes/test_retriever.py | 6 +- 6 files changed, 86 insertions(+), 73 deletions(-) create mode 100644 test/modeling/test_language.py diff --git a/haystack/modeling/model/biadaptive_model.py b/haystack/modeling/model/biadaptive_model.py index 41be92cce5..d80f009578 100644 --- a/haystack/modeling/model/biadaptive_model.py +++ b/haystack/modeling/model/biadaptive_model.py @@ -144,13 +144,13 @@ def load( """ # Language Model if lm1_name: - language_model1 = get_language_model(os.path.join(load_dir, lm1_name), model_type="DPRQuestionEncoder") + language_model1 = get_language_model(os.path.join(load_dir, lm1_name)) else: - language_model1 = get_language_model(load_dir, model_type="DPRQuestionEncoder") + language_model1 = get_language_model(load_dir) if lm2_name: - language_model2 = get_language_model(os.path.join(load_dir, lm2_name), model_type="DPRContextEncoder") + language_model2 = get_language_model(os.path.join(load_dir, lm2_name)) else: - language_model2 = get_language_model(load_dir, model_type="DPRContextEncoder") + language_model2 = get_language_model(load_dir) # Prediction heads ph_config_files = cls._get_prediction_head_files(load_dir) @@ -495,8 +495,8 @@ def convert_from_transformers( :type processor: Processor :return: AdaptiveModel """ - lm1 = get_language_model(pretrained_model_name_or_path=model_name_or_path1, model_type="DPRQuestionEncoder") - lm2 = get_language_model(pretrained_model_name_or_path=model_name_or_path2, model_type="DPRContextEncoder") + lm1 = get_language_model(pretrained_model_name_or_path=model_name_or_path1) + lm2 = get_language_model(pretrained_model_name_or_path=model_name_or_path2) prediction_head = TextSimilarityHead(similarity_function=similarity_function) # TODO Infer type of head automatically from config if task_type == "text_similarity": diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index c49689e111..c702a8181c 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -739,6 +739,7 @@ def forward( "Codebert": HFLanguageModel, "DebertaV2": HFLanguageModelWithPooler, "DistilBert": HFLanguageModelNoSegmentIds, + "dpr": 
DPREncoder, "DPRContextEncoder": DPREncoder, "DPRQuestionEncoder": DPREncoder, "Electra": HFLanguageModelWithPooler, @@ -785,7 +786,7 @@ def forward( "DebertaV2": { "summary_last_dropout": 0, "summary_type": "first", - "summary_activati": "tanh", + "summary_activation": "tanh", "summary_use_proj": False, }, } @@ -813,7 +814,6 @@ def get_language_model_class(model_type: str) -> Optional[Type[Union[HFLanguageM def get_language_model( pretrained_model_name_or_path: Union[Path, str], - model_type: Optional[str] = None, language: str = None, n_added_tokens: int = 0, use_auth_token: Optional[Union[str, bool]] = None, @@ -829,41 +829,38 @@ def get_language_model( See all supported model variations at: https://huggingface.co/models. - The appropriate language model class is inferred automatically from model configuration - or can be manually supplied using `language_model_class`. + The appropriate language model class is inferred automatically from model configuration. :param pretrained_model_name_or_path: The path of the saved pretrained model or its name. - :param revision: The version of the model to use from the Hugging Face model hub. This can be a tag name, a branch name, or a commit hash. - :param language_model_type: (Optional) Name of the language model class to load (for example `Bert`). Overrides any other discovered value. + :param language: The language of the model (i.e english etc). + :param n_added_tokens: The number of added tokens to the model. + :param use_auth_token: Whether to use the huggingface auth token for private repos or not. + :param revision: The version of the model to use from the Hugging Face model hub. This can be a tag name, + a branch name, or a commit hash. + :param autoconfig_kwargs: Additional keyword arguments to pass to the autoconfig function. + :param model_kwargs: Additional keyword arguments to pass to the lamguage model constructor. 
""" - logger.info(f" * LOADING MODEL: '{pretrained_model_name_or_path}' {'('+model_type+')' if model_type else ''}") - from_where = "local storage" + valid_pretrained_model_name_or_path = ( + isinstance(pretrained_model_name_or_path, (str, Path)) and len(str(pretrained_model_name_or_path)) > 0 + ) + if not valid_pretrained_model_name_or_path: + raise ValueError(f"{pretrained_model_name_or_path} is not a valid pretrained_model_name_or_path parameter") config_file = Path(pretrained_model_name_or_path) / "language_model_config.json" + available_local_filesystem = True if os.path.exists(config_file) else False + model_type = None + if available_local_filesystem: + # it's a local directory in Haystack format + config = json.load(open(config_file)) + model_type = config["name"] - if model_type is None: - - if os.path.exists(config_file): - # it's a local directory in Haystack format - config = json.load(open(config_file)) - model_type = config["name"] - if not model_type: - model_type = _get_model_type( - pretrained_model_name_or_path, - use_auth_token=use_auth_token, - revision=revision, - autoconfig_kwargs=autoconfig_kwargs, - ) - - else: - # It's from the model hub - from_where = "the Model Hub" - model_type = _get_model_type( - pretrained_model_name_or_path, - use_auth_token=use_auth_token, - revision=revision, - autoconfig_kwargs=autoconfig_kwargs, - ) + if not model_type: + model_type = _get_model_type( + pretrained_model_name_or_path, + use_auth_token=use_auth_token, + revision=revision, + autoconfig_kwargs=autoconfig_kwargs, + ) if not model_type: logger.error( @@ -886,6 +883,8 @@ def get_language_model( f"Supported model types are: {', '.join(HUGGINGFACE_TO_HAYSTACK.keys())}" ) + logger.info(f" * LOADING MODEL: '{pretrained_model_name_or_path}' {'(' + model_type + ')' if model_type else ''}") + # Instantiate the class for this model language_model = language_model_class( pretrained_model_name_or_path=pretrained_model_name_or_path, @@ -895,7 +894,10 @@ def get_language_model( use_auth_token=use_auth_token, model_kwargs=model_kwargs, ) - logger.info(f"Loaded '{pretrained_model_name_or_path}' ({model_type} model) from {from_where}.") + logger.info( + f"Loaded '{pretrained_model_name_or_path}' ({model_type} model) " + f"from {'local file system' if available_local_filesystem else 'model hub'}." 
+ ) return language_model diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 82417395cb..f7c7312e60 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -162,10 +162,7 @@ def __init__( use_auth_token=use_auth_token, ) self.query_encoder = get_language_model( - pretrained_model_name_or_path=query_embedding_model, - revision=model_version, - model_type="DPRQuestionEncoder", - use_auth_token=use_auth_token, + pretrained_model_name_or_path=query_embedding_model, revision=model_version, use_auth_token=use_auth_token ) self.passage_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained( pretrained_model_name_or_path=passage_embedding_model, @@ -175,10 +172,7 @@ def __init__( use_auth_token=use_auth_token, ) self.passage_encoder = get_language_model( - pretrained_model_name_or_path=passage_embedding_model, - revision=model_version, - model_type="DPRContextEncoder", - use_auth_token=use_auth_token, + pretrained_model_name_or_path=passage_embedding_model, revision=model_version, use_auth_token=use_auth_token ) self.processor = TextSimilarityProcessor( @@ -865,10 +859,7 @@ def __init__( use_auth_token=use_auth_token, ) self.query_encoder = get_language_model( - pretrained_model_name_or_path=query_embedding_model, - model_type="DPRQuestionEncoder", - revision=model_version, - use_auth_token=use_auth_token, + pretrained_model_name_or_path=query_embedding_model, revision=model_version, use_auth_token=use_auth_token ) self.passage_tokenizer = passage_tokenizer_class.from_pretrained( passage_embedding_model, @@ -878,10 +869,7 @@ def __init__( use_auth_token=use_auth_token, ) self.passage_encoder = get_language_model( - pretrained_model_name_or_path=passage_embedding_model, - model_type="DPRContextEncoder", - revision=model_version, - use_auth_token=use_auth_token, + pretrained_model_name_or_path=passage_embedding_model, revision=model_version, use_auth_token=use_auth_token ) self.table_tokenizer = table_tokenizer_class.from_pretrained( table_embedding_model, @@ -891,10 +879,7 @@ def __init__( use_auth_token=use_auth_token, ) self.table_encoder = get_language_model( - pretrained_model_name_or_path=table_embedding_model, - model_type="DPRContextEncoder", - revision=model_version, - use_auth_token=use_auth_token, + pretrained_model_name_or_path=table_embedding_model, revision=model_version, use_auth_token=use_auth_token ) self.processor = TableTextSimilarityProcessor( diff --git a/test/modeling/test_dpr.py b/test/modeling/test_dpr.py index 04eab24763..af1cf0e91a 100644 --- a/test/modeling/test_dpr.py +++ b/test/modeling/test_dpr.py @@ -727,13 +727,9 @@ def test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa query_tokenizer = get_tokenizer( pretrained_model_name_or_path=query_embedding_model ) # tokenizer class is inferred automatically - query_encoder = get_language_model( - pretrained_model_name_or_path=query_embedding_model, model_type="DPRQuestionEncoder" - ) + query_encoder = get_language_model(pretrained_model_name_or_path=query_embedding_model) passage_tokenizer = get_tokenizer(pretrained_model_name_or_path=passage_embedding_model) - passage_encoder = get_language_model( - pretrained_model_name_or_path=passage_embedding_model, model_type="DPRContextEncoder" - ) + passage_encoder = get_language_model(pretrained_model_name_or_path=passage_embedding_model) processor = TextSimilarityProcessor( query_tokenizer=query_tokenizer, @@ -775,15 +771,11 @@ def 
test_dpr_processor_save_load_non_bert_tokenizer(tmp_path: Path, query_and_pa loaded_query_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, use_fast=True ) # tokenizer class is inferred automatically - loaded_query_encoder = get_language_model( - pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir, model_type="DPRQuestionEncoder" - ) + loaded_query_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / query_encoder_dir) loaded_passage_tokenizer = get_tokenizer( pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, use_fast=True ) - loaded_passage_encoder = get_language_model( - pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir, model_type="DPRQuestionEncoder" - ) + loaded_passage_encoder = get_language_model(pretrained_model_name_or_path=Path(save_dir) / passage_encoder_dir) loaded_processor = TextSimilarityProcessor( query_tokenizer=loaded_query_tokenizer, diff --git a/test/modeling/test_language.py b/test/modeling/test_language.py new file mode 100644 index 0000000000..844f2302b7 --- /dev/null +++ b/test/modeling/test_language.py @@ -0,0 +1,34 @@ +import pytest + +from haystack.modeling.model.language_model import get_language_model + + +@pytest.mark.parametrize( + "pretrained_model_name_or_path, lm_class", + [ + ("google/bert_uncased_L-2_H-128_A-2", "HFLanguageModel"), + ("google/electra-small-generator", "HFLanguageModelWithPooler"), + ("distilbert-base-uncased", "HFLanguageModelNoSegmentIds"), + ("deepset/bert-small-mm_retrieval-passage_encoder", "DPREncoder"), + ], +) +def test_basic_loading(pretrained_model_name_or_path, lm_class): + lm = get_language_model(pretrained_model_name_or_path) + mod = __import__("haystack.modeling.model.language_model", fromlist=[lm_class]) + klass = getattr(mod, lm_class) + assert isinstance(lm, klass) + + +def test_basic_loading_unknown_model(): + with pytest.raises(OSError): + get_language_model("model_that_doesnt_exist") + + +def test_basic_loading_with_empty_string(): + with pytest.raises(ValueError): + get_language_model("") + + +def test_basic_loading_invalid_params(): + with pytest.raises(ValueError): + get_language_model(None) diff --git a/test/nodes/test_retriever.py b/test/nodes/test_retriever.py index 5bd23e0956..c5081b1e5e 100644 --- a/test/nodes/test_retriever.py +++ b/test/nodes/test_retriever.py @@ -345,9 +345,9 @@ def sum_params(model): def test_table_text_retriever_training(document_store): retriever = TableTextRetriever( document_store=document_store, - query_embedding_model="prajjwal1/bert-tiny", - passage_embedding_model="prajjwal1/bert-tiny", - table_embedding_model="prajjwal1/bert-tiny", + query_embedding_model="deepset/bert-small-mm_retrieval-question_encoder", + passage_embedding_model="deepset/bert-small-mm_retrieval-passage_encoder", + table_embedding_model="deepset/bert-small-mm_retrieval-table_encoder", use_gpu=False, ) From b7c3329492d989423ac9023bb1c322b11ffc13a5 Mon Sep 17 00:00:00 2001 From: ZanSara Date: Thu, 21 Jul 2022 13:16:00 +0200 Subject: [PATCH 87/89] fix tokenization tests --- haystack/modeling/model/language_model.py | 13 +++--- test/modeling/test_tokenization.py | 56 ++++++++--------------- 2 files changed, 25 insertions(+), 44 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 1f6dde100d..4bcd7bdcf3 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -840,16 +840,15 @@ def 
get_language_model( :param autoconfig_kwargs: Additional keyword arguments to pass to the autoconfig function. :param model_kwargs: Additional keyword arguments to pass to the lamguage model constructor. """ - valid_pretrained_model_name_or_path = ( - isinstance(pretrained_model_name_or_path, (str, Path)) and len(str(pretrained_model_name_or_path)) > 0 - ) - if not valid_pretrained_model_name_or_path: + + if not pretrained_model_name_or_path or not isinstance(pretrained_model_name_or_path, (str, Path)): raise ValueError(f"{pretrained_model_name_or_path} is not a valid pretrained_model_name_or_path parameter") config_file = Path(pretrained_model_name_or_path) / "language_model_config.json" - available_local_filesystem = True if os.path.exists(config_file) else False + model_type = None - if available_local_filesystem: + config_file_exists = os.path.exists(config_file) + if config_file_exists: # it's a local directory in Haystack format config = json.load(open(config_file)) model_type = config["name"] @@ -896,7 +895,7 @@ def get_language_model( ) logger.info( f"Loaded '{pretrained_model_name_or_path}' ({model_type} model) " - f"from {'local file system' if available_local_filesystem else 'model hub'}." + f"from {'local file system' if config_file_exists else 'model hub'}." ) return language_model diff --git a/test/modeling/test_tokenization.py b/test/modeling/test_tokenization.py index 70a3dc8fcf..26bac068f2 100644 --- a/test/modeling/test_tokenization.py +++ b/test/modeling/test_tokenization.py @@ -12,8 +12,12 @@ from haystack.modeling.model.tokenization import get_tokenizer -TOKENIZERS_TO_TEST = ["bert-base-cased", "roberta-base", "xlnet-base-cased"] -TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER = [("bert-base-cased", "##"), ("roberta-base", "Ġ"), ("xlnet-base-cased", "▁")] +BERT = "bert-base-cased" +ROBERTA = "roberta-base" +XLNET = "xlnet-base-cased" + +TOKENIZERS_TO_TEST = [BERT, ROBERTA, XLNET] +TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER = [(BERT, "##"), (ROBERTA, "Ġ"), (XLNET, "▁")] REGULAR_SENTENCE = "This is a sentence" @@ -139,7 +143,7 @@ def test_save_load(tmp_path, model_name: str): @pytest.mark.integration def test_tokenize_custom_vocab_bert(): - tokenizer = get_tokenizer(pretrained_model_name_or_path="bert-base-cased", do_lower_case=False) + tokenizer = get_tokenizer(pretrained_model_name_or_path=BERT, do_lower_case=False) tokenizer.add_tokens(new_tokens=["neverseentokens"]) text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars" @@ -181,7 +185,7 @@ def test_tokenization_on_edge_cases_full_sequence_tokenization(model_name: str, SENTENCE_WITH_CUSTOM_TOKEN, GERMAN_SENTENCE, ]) -@pytest.mark.parametrize("model_name", [t for t in TOKENIZERS_TO_TEST if t != "roberta-base"]) +@pytest.mark.parametrize("model_name", [t for t in TOKENIZERS_TO_TEST if t != ROBERTA]) def test_tokenization_on_edge_cases_full_sequence_tokenization_roberta_exceptions(model_name: str, edge_case: str): """ Verify that tokenization on full sequence is the same as the one on "whitespace tokenized words". 
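
The edge-case tests above all check the same invariant: encoding a whitespace-pre-tokenized word list (`is_split_into_words=True`) must produce the same tokens as tokenizing the whitespace-normalized string directly. A minimal, self-contained sketch of that check, distilled from the tests in this patch (it assumes network access to download `bert-base-cased`; `get_tokenizer` is the Haystack helper being patched here):

    from tokenizers.pre_tokenizers import WhitespaceSplit
    from haystack.modeling.model.tokenization import get_tokenizer

    text = "This is a sentence with\tmultiple tabs"
    tokenizer = get_tokenizer(pretrained_model_name_or_path="bert-base-cased", do_lower_case=False)

    # Route 1: tokenize the whitespace-normalized full string.
    expected = tokenizer.tokenize(" ".join(text.split()))

    # Route 2: split on whitespace first, then encode the resulting word list.
    words_and_spans = WhitespaceSplit().pre_tokenize_str(text)
    words = [word for word, _span in words_and_spans]
    encoded = tokenizer.encode_plus(words, is_split_into_words=True, add_special_tokens=False).encodings[0]

    # The two routes should agree. RoBERTa needs add_prefix_space=True and still
    # diverges on a couple of inputs, which is why the tests exclude it for two edge cases.
    assert encoded.tokens == expected
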
@@ -230,30 +234,6 @@ def test_tokenization_on_edge_cases_full_sequence_verify_spans(model_name: str, assert token == edge_case[start:end] -@pytest.mark.integration -@pytest.mark.parametrize('edge_case', [ - SENTENCE_WITH_CUSTOM_TOKEN, - GERMAN_SENTENCE, -]) -@pytest.mark.parametrize("model_name,marker", TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER) -def test_tokenization_on_edge_cases_full_sequence_verify_spans_roberta_exception(model_name: str, marker: str, edge_case: str): - tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True) - - pre_tokenizer = WhitespaceSplit() - words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case) - words = [x[0] for x in words_and_spans] - word_spans = [x[1] for x in words_and_spans] - - encoded = tokenizer.encode_plus(words, is_split_into_words=True, add_special_tokens=False).encodings[0] - - # subword-tokens have special chars depending on model type. To align with original text we get rid of them - tokens = [token.replace(marker, "") for token in encoded.tokens] - token_offsets = convert_offset_from_word_reference_to_text_reference(encoded.offsets, encoded.words, word_spans) - - for token, (start, end) in zip(tokens, token_offsets): - assert token == edge_case[start:end] - - @pytest.mark.integration @pytest.mark.parametrize('edge_case', [ REGULAR_SENTENCE, @@ -268,14 +248,13 @@ def test_tokenization_on_edge_cases_full_sequence_verify_spans_roberta_exception SENTENCE_WITH_LINEBREAKS, SENTENCE_WITH_TABS, ]) -@pytest.mark.parametrize("model_name,marker", TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER) -def test_detokenization(model_name: str, marker: str, edge_case: str): - tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False) - +def test_detokenization_for_bert(edge_case): + tokenizer = get_tokenizer(pretrained_model_name_or_path=BERT, do_lower_case=False) + encoded = tokenizer.encode_plus(edge_case, add_special_tokens=False).encodings[0] detokenized = " ".join(encoded.tokens) - detokenized = detokenized.replace(marker, "") + detokenized = re.sub(r"(^|\s+)(##)", "", detokenized) detokenized_ids = tokenizer(detokenized, add_special_tokens=False)["input_ids"] detokenized_tokens = [tokenizer.decode([tok_id]).strip() for tok_id in detokenized_ids] @@ -285,20 +264,23 @@ def test_detokenization(model_name: str, marker: str, edge_case: str): @pytest.mark.integration def test_encode_plus_for_bert(): - tokenizer = get_tokenizer(pretrained_model_name_or_path="bert-base-cased", do_lower_case=False) + tokenizer = get_tokenizer(pretrained_model_name_or_path=BERT, do_lower_case=False) text = "Some Text with neverseentokens plus !215?#. 
and a combined-token_with/chars" encoded_batch = tokenizer.encode_plus(text) encoded = encoded_batch.encodings[0] words = np.array(encoded.words) - words[words is None] = -1 + words[0] = -1 + words[-1] = -1 + + print(words.tolist()) tokens = encoded.tokens offsets = [x[0] for x in encoded.offsets] start_of_word = [False] + list(np.ediff1d(words) > 0) - assert zip(tokens, offsets, start_of_word) == [ + assert list(zip(tokens, offsets, start_of_word)) == [ ("[CLS]", 0, False), ("Some", 0, True), ("Text", 5, True), @@ -330,7 +312,7 @@ def test_encode_plus_for_bert(): @pytest.mark.integration def test_tokenize_custom_vocab_bert(): - tokenizer = get_tokenizer(pretrained_model_name_or_path="bert-base-cased", do_lower_case=False) + tokenizer = get_tokenizer(pretrained_model_name_or_path=BERT, do_lower_case=False) tokenizer.add_tokens(new_tokens=["neverseentokens"]) text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars" From 216ef43ce439221e0ca5e2eb9ff0111c77c19e2b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 21 Jul 2022 11:18:59 +0000 Subject: [PATCH 88/89] Update Documentation & Code Style --- haystack/modeling/model/language_model.py | 4 +- haystack/modeling/model/tokenization.py | 20 ++-- test/modeling/test_tokenization.py | 131 +++++++++++----------- 3 files changed, 79 insertions(+), 76 deletions(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 4bcd7bdcf3..5ea8fa826b 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -840,12 +840,12 @@ def get_language_model( :param autoconfig_kwargs: Additional keyword arguments to pass to the autoconfig function. :param model_kwargs: Additional keyword arguments to pass to the lamguage model constructor. """ - + if not pretrained_model_name_or_path or not isinstance(pretrained_model_name_or_path, (str, Path)): raise ValueError(f"{pretrained_model_name_or_path} is not a valid pretrained_model_name_or_path parameter") config_file = Path(pretrained_model_name_or_path) / "language_model_config.json" - + model_type = None config_file_exists = os.path.exists(config_file) if config_file_exists: diff --git a/haystack/modeling/model/tokenization.py b/haystack/modeling/model/tokenization.py index 6d258d6535..9467d38132 100644 --- a/haystack/modeling/model/tokenization.py +++ b/haystack/modeling/model/tokenization.py @@ -59,7 +59,12 @@ def get_tokenizer( params["keep_accents"] = True return AutoTokenizer.from_pretrained( - pretrained_model_name_or_path=model_name_or_path, revision=revision, use_fast=use_fast, use_auth_token=use_auth_token, **params, **kwargs + pretrained_model_name_or_path=model_name_or_path, + revision=revision, + use_fast=use_fast, + use_auth_token=use_auth_token, + **params, + **kwargs, ) @@ -156,15 +161,15 @@ def truncate_sequences( """ Reduces a single sequence or a pair of sequences to a maximum sequence length. The sequences can contain tokens or any other elements (offsets, masks ...). - If `with_special_tokens` is enabled, it'll remove some additional tokens to have exactly + If `with_special_tokens` is enabled, it'll remove some additional tokens to have exactly enough space for later adding special tokens (CLS, SEP etc.) 
Supported truncation strategies: - - longest_first: (default) Iteratively reduce the inputs sequence until the input is under - max_length starting from the longest one at each token (when there is a pair of input sequences). + - longest_first: (default) Iteratively reduce the inputs sequence until the input is under + max_length starting from the longest one at each token (when there is a pair of input sequences). Overflowing tokens only contains overflow from the first sequence. - - only_first: Only truncate the first sequence. raise an error if the first sequence is + - only_first: Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. - only_second: Only truncate the second sequence - do_not_truncate: Does not truncate (raise an error if the input sequence is longer than max_length) @@ -173,9 +178,9 @@ def truncate_sequences( :param seq_b: Optional second sequence of tokens/offsets/... :param tokenizer: Tokenizer (e.g. from get_tokenizer)) :param max_seq_len: - :param truncation_strategy: how the sequence(s) should be truncated down. + :param truncation_strategy: how the sequence(s) should be truncated down. Default: "longest_first" (see above for other options). - :param with_special_tokens: If true, it'll remove some additional tokens to have exactly enough space + :param with_special_tokens: If true, it'll remove some additional tokens to have exactly enough space for later adding special tokens (CLS, SEP etc.) :param stride: optional stride of the window during truncation :return: truncated seq_a, truncated seq_b, overflowing tokens @@ -197,6 +202,7 @@ def truncate_sequences( ) return (seq_a, seq_b, overflowing_tokens) + # # FIXME this is a relic from FARM. If there's the occasion, remove it! 
# diff --git a/test/modeling/test_tokenization.py b/test/modeling/test_tokenization.py index 26bac068f2..5758eeedec 100644 --- a/test/modeling/test_tokenization.py +++ b/test/modeling/test_tokenization.py @@ -47,7 +47,6 @@ def from_pretrained(cls, *args, **kwargs): return cls() - @pytest.fixture(autouse=True) def mock_autotokenizer(request, monkeypatch): # Do not patch integration tests @@ -56,7 +55,6 @@ def mock_autotokenizer(request, monkeypatch): monkeypatch.setattr(haystack.modeling.model.tokenization, "AutoTokenizer", AutoTokenizer) - def convert_offset_from_word_reference_to_text_reference(offsets, words, word_spans): """ Token offsets are originally relative to the beginning of the word @@ -69,60 +67,51 @@ def convert_offset_from_word_reference_to_text_reference(offsets, words, word_sp word_start = word_spans[word_index][0] token_offsets.append((start + word_start, end + word_start)) return token_offsets - # # Unit tests # + def test_get_tokenizer_str(): tokenizer = get_tokenizer(pretrained_model_name_or_path="test-model-name") tokenizer.mocker.from_pretrained.assert_called_with( - pretrained_model_name_or_path='test-model-name', - revision=None, - use_fast=True, - use_auth_token=None + pretrained_model_name_or_path="test-model-name", revision=None, use_fast=True, use_auth_token=None ) def test_get_tokenizer_path(tmp_path): tokenizer = get_tokenizer(pretrained_model_name_or_path=tmp_path / "test-path") tokenizer.mocker.from_pretrained.assert_called_with( - pretrained_model_name_or_path=str(tmp_path / "test-path"), - revision=None, - use_fast=True, - use_auth_token=None + pretrained_model_name_or_path=str(tmp_path / "test-path"), revision=None, use_fast=True, use_auth_token=None ) def test_get_tokenizer_keep_accents(): tokenizer = get_tokenizer(pretrained_model_name_or_path="test-model-name-albert") tokenizer.mocker.from_pretrained.assert_called_with( - pretrained_model_name_or_path='test-model-name-albert', - revision=None, + pretrained_model_name_or_path="test-model-name-albert", + revision=None, use_fast=True, use_auth_token=None, - keep_accents=True + keep_accents=True, ) def test_get_tokenizer_mlm_warning(caplog): tokenizer = get_tokenizer(pretrained_model_name_or_path="test-model-name-mlm") tokenizer.mocker.from_pretrained.assert_called_with( - pretrained_model_name_or_path='test-model-name-mlm', - revision=None, - use_fast=True, - use_auth_token=None + pretrained_model_name_or_path="test-model-name-mlm", revision=None, use_fast=True, use_auth_token=None ) assert "MLM part of codebert is currently not supported in Haystack".lower() in caplog.text.lower() - # # Integration tests # + @pytest.mark.integration @pytest.mark.parametrize("model_name", TOKENIZERS_TO_TEST) def test_save_load(tmp_path, model_name: str): @@ -148,73 +137,78 @@ def test_tokenize_custom_vocab_bert(): text = "Some Text with neverseentokens plus !215?#. and a combined-token_with/chars" tokenized = tokenizer.tokenize(text) - assert tokenized == f"Some Text with neverseentokens plus ! 215 ? # . and a combined - token _ with / ch ##ars".split() + assert ( + tokenized == f"Some Text with neverseentokens plus ! 215 ? # . 
and a combined - token _ with / ch ##ars".split() + ) @pytest.mark.integration -@pytest.mark.parametrize('edge_case', [ - REGULAR_SENTENCE, - OTHER_ALPHABETS, - GIBBERISH_SENTENCE, - SENTENCE_WITH_ELLIPSIS, - SENTENCE_WITH_LINEBREAK_1, - SENTENCE_WITH_LINEBREAK_2, - SENTENCE_WITH_LINEBREAKS, - SENTENCE_WITH_EXCESS_WHITESPACE, - SENTENCE_WITH_TABS -]) +@pytest.mark.parametrize( + "edge_case", + [ + REGULAR_SENTENCE, + OTHER_ALPHABETS, + GIBBERISH_SENTENCE, + SENTENCE_WITH_ELLIPSIS, + SENTENCE_WITH_LINEBREAK_1, + SENTENCE_WITH_LINEBREAK_2, + SENTENCE_WITH_LINEBREAKS, + SENTENCE_WITH_EXCESS_WHITESPACE, + SENTENCE_WITH_TABS, + ], +) @pytest.mark.parametrize("model_name", TOKENIZERS_TO_TEST) def test_tokenization_on_edge_cases_full_sequence_tokenization(model_name: str, edge_case: str): """ Verify that tokenization on full sequence is the same as the one on "whitespace tokenized words" """ - tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True) - + tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True) + pre_tokenizer = WhitespaceSplit() words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case) words = [x[0] for x in words_and_spans] encoded = tokenizer.encode_plus(words, is_split_into_words=True, add_special_tokens=False).encodings[0] - expected_tokenization = tokenizer.tokenize(" ".join(edge_case.split())) # remove multiple whitespaces + expected_tokenization = tokenizer.tokenize(" ".join(edge_case.split())) # remove multiple whitespaces assert encoded.tokens == expected_tokenization @pytest.mark.integration -@pytest.mark.parametrize('edge_case', [ - SENTENCE_WITH_CUSTOM_TOKEN, - GERMAN_SENTENCE, -]) +@pytest.mark.parametrize("edge_case", [SENTENCE_WITH_CUSTOM_TOKEN, GERMAN_SENTENCE]) @pytest.mark.parametrize("model_name", [t for t in TOKENIZERS_TO_TEST if t != ROBERTA]) def test_tokenization_on_edge_cases_full_sequence_tokenization_roberta_exceptions(model_name: str, edge_case: str): """ Verify that tokenization on full sequence is the same as the one on "whitespace tokenized words". These test cases work for all tokenizers under test except for RoBERTa. 
""" - tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True) - + tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True) + pre_tokenizer = WhitespaceSplit() words_and_spans = pre_tokenizer.pre_tokenize_str(edge_case) words = [x[0] for x in words_and_spans] encoded = tokenizer.encode_plus(words, is_split_into_words=True, add_special_tokens=False).encodings[0] - expected_tokenization = tokenizer.tokenize(" ".join(edge_case.split())) # remove multiple whitespaces + expected_tokenization = tokenizer.tokenize(" ".join(edge_case.split())) # remove multiple whitespaces assert encoded.tokens == expected_tokenization @pytest.mark.integration -@pytest.mark.parametrize('edge_case', [ - REGULAR_SENTENCE, - # OTHER_ALPHABETS, # contains [UNK] that are impossible to match back to original text space - GIBBERISH_SENTENCE, - SENTENCE_WITH_ELLIPSIS, - SENTENCE_WITH_LINEBREAK_1, - SENTENCE_WITH_LINEBREAK_2, - SENTENCE_WITH_LINEBREAKS, - SENTENCE_WITH_EXCESS_WHITESPACE, - SENTENCE_WITH_TABS, -]) +@pytest.mark.parametrize( + "edge_case", + [ + REGULAR_SENTENCE, + # OTHER_ALPHABETS, # contains [UNK] that are impossible to match back to original text space + GIBBERISH_SENTENCE, + SENTENCE_WITH_ELLIPSIS, + SENTENCE_WITH_LINEBREAK_1, + SENTENCE_WITH_LINEBREAK_2, + SENTENCE_WITH_LINEBREAKS, + SENTENCE_WITH_EXCESS_WHITESPACE, + SENTENCE_WITH_TABS, + ], +) @pytest.mark.parametrize("model_name,marker", TOKENIZERS_TO_TEST_WITH_TOKEN_MARKER) def test_tokenization_on_edge_cases_full_sequence_verify_spans(model_name: str, marker: str, edge_case: str): tokenizer = get_tokenizer(pretrained_model_name_or_path=model_name, do_lower_case=False, add_prefix_space=True) @@ -235,22 +229,25 @@ def test_tokenization_on_edge_cases_full_sequence_verify_spans(model_name: str, @pytest.mark.integration -@pytest.mark.parametrize('edge_case', [ - REGULAR_SENTENCE, - GERMAN_SENTENCE, - SENTENCE_WITH_EXCESS_WHITESPACE, - OTHER_ALPHABETS, - GIBBERISH_SENTENCE, - SENTENCE_WITH_ELLIPSIS, - SENTENCE_WITH_CUSTOM_TOKEN, - SENTENCE_WITH_LINEBREAK_1, - SENTENCE_WITH_LINEBREAK_2, - SENTENCE_WITH_LINEBREAKS, - SENTENCE_WITH_TABS, -]) +@pytest.mark.parametrize( + "edge_case", + [ + REGULAR_SENTENCE, + GERMAN_SENTENCE, + SENTENCE_WITH_EXCESS_WHITESPACE, + OTHER_ALPHABETS, + GIBBERISH_SENTENCE, + SENTENCE_WITH_ELLIPSIS, + SENTENCE_WITH_CUSTOM_TOKEN, + SENTENCE_WITH_LINEBREAK_1, + SENTENCE_WITH_LINEBREAK_2, + SENTENCE_WITH_LINEBREAKS, + SENTENCE_WITH_TABS, + ], +) def test_detokenization_for_bert(edge_case): tokenizer = get_tokenizer(pretrained_model_name_or_path=BERT, do_lower_case=False) - + encoded = tokenizer.encode_plus(edge_case, add_special_tokens=False).encodings[0] detokenized = " ".join(encoded.tokens) @@ -325,4 +322,4 @@ def test_tokenize_custom_vocab_bert(): assert encoded.tokens == tokenized assert offsets == [0, 5, 10, 15, 31, 36, 37, 40, 41, 42, 44, 48, 50, 58, 59, 64, 65, 69, 70, 72] - assert start_of_word_single == [True]*19 + [False] + assert start_of_word_single == [True] * 19 + [False] From 0e7ec82b0fc98ca1273c9ba58d1bc3fa3b5416ff Mon Sep 17 00:00:00 2001 From: Sara Zan Date: Fri, 22 Jul 2022 13:53:26 +0200 Subject: [PATCH 89/89] Adjust model_type resolution to check config architectures (#2871) --- haystack/modeling/model/language_model.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index 
5ea8fa826b..34a4565768 100644
--- a/haystack/modeling/model/language_model.py
+++ b/haystack/modeling/model/language_model.py
@@ -739,7 +739,6 @@ def forward(
     "Codebert": HFLanguageModel,
     "DebertaV2": HFLanguageModelWithPooler,
     "DistilBert": HFLanguageModelNoSegmentIds,
-    "dpr": DPREncoder,
     "DPRContextEncoder": DPREncoder,
     "DPRQuestionEncoder": DPREncoder,
     "Electra": HFLanguageModelWithPooler,
@@ -802,6 +801,15 @@ def capitalize_model_type(model_type: str) -> str:
     return HUGGINGFACE_CAPITALIZE.get(model_type.lower(), model_type)
 
 
+def is_supported_model(model_type: Optional[str]):
+    """
+    Returns whether the model type is supported by Haystack.
+    :param model_type: the model_type as found in the config file
+    :return: whether the model type is supported by Haystack
+    """
+    return model_type and model_type.lower() in HUGGINGFACE_CAPITALIZE
+
+
 def get_language_model_class(model_type: str) -> Optional[Type[Union[HFLanguageModel, DPREncoder]]]:
     """
     Returns the corresponding Haystack LanguageModel subclass.
@@ -922,6 +930,9 @@ def _get_model_type(
             **(autoconfig_kwargs or {}),
         )
         model_type = config.model_type
+        # if unsupported model, try to infer from config.architectures
+        if not is_supported_model(model_type) and config.architectures:
+            model_type = config.architectures[0] if is_supported_model(config.architectures[0]) else None
     except Exception as e:
         logger.error(f"AutoConfig failed to load on '{model_name_or_path}': {str(e)}")
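
With patch 89 applied, model-type resolution follows a single path: `get_language_model` first looks for a Haystack-style `language_model_config.json` next to the weights, otherwise asks `AutoConfig` for `config.model_type`, and now falls back to `config.architectures[0]` when the reported type is not a supported key (which is presumably how DPR checkpoints are still picked up after the explicit `"dpr"` entry was dropped). A usage sketch distilled from `test/modeling/test_language.py` added in patch 86 — it downloads small models from the Hugging Face hub, so treat it as an integration-style example rather than a unit test:

    from haystack.modeling.model.language_model import (
        get_language_model,
        DPREncoder,
        HFLanguageModelWithPooler,
    )

    # Resolved via config.model_type: "electra" maps to HFLanguageModelWithPooler.
    electra = get_language_model("google/electra-small-generator")
    assert isinstance(electra, HFLanguageModelWithPooler)

    # DPR checkpoints report model_type "dpr", no longer an explicit key, so resolution
    # should now go through the config.architectures[0] fallback and yield a DPREncoder.
    dpr = get_language_model("deepset/bert-small-mm_retrieval-passage_encoder")
    assert isinstance(dpr, DPREncoder)

    # Invalid inputs fail fast with the ValueError introduced in patch 86.
    for bad_input in ("", None):
        try:
            get_language_model(bad_input)
        except ValueError:
            pass
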