From c1c0cbfde462ff5490731b4425b3c8e51b45218e Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Tue, 5 Mar 2024 10:20:18 +0100 Subject: [PATCH] docs: Update docs of MetaFieldRanker, TransformersSimilarityRanker (#7301) * docs: Update docstrings of MetaFieldRanker and TransformersSimilarityRanker * add warm_up() call to usage example * Apply suggestions from code review Co-authored-by: Stefano Fiorucci * show result of usage example --------- Co-authored-by: Stefano Fiorucci --- haystack/components/rankers/meta_field.py | 143 ++++++++++-------- .../rankers/transformers_similarity.py | 102 ++++++++----- haystack/components/readers/extractive.py | 97 +++++++----- test/components/rankers/test_metafield.py | 40 +---- 4 files changed, 208 insertions(+), 174 deletions(-) diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py index 2ce4876c32..f855eff7ae 100644 --- a/haystack/components/rankers/meta_field.py +++ b/haystack/components/rankers/meta_field.py @@ -3,7 +3,7 @@ from dateutil.parser import parse as date_parse -from haystack import Document, component, default_to_dict, logging +from haystack import Document, component, logging logger = logging.getLogger(__name__) @@ -12,6 +12,7 @@ class MetaFieldRanker: """ Ranks Documents based on the value of their specific meta field. + The ranking can be performed in descending order or ascending order. Usage example: @@ -43,27 +44,33 @@ def __init__( """ Creates an instance of MetaFieldRanker. - :param meta_field: The name of the meta field to rank by. - :param weight: In range [0,1]. - 0 disables ranking by a meta field. - 0.5 content and meta fields have the same impact for the ranking. - 1 means ranking by a meta field only. The highest value comes first. - :param top_k: The maximum number of Documents you want the Ranker to return per query. If not provided, the - Ranker returns all documents it receives in the new ranking order. - :param ranking_mode: The mode used to combine the Retriever's and Ranker's scores. - Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. - Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1]. - :param sort_order: Whether to sort the meta field by ascending or descending order. - Possible values are `descending` (default) and `ascending`. - :param meta_value_type: Parse the meta value into the data type specified before sorting. - This will only work if all meta values stored under `meta_field` in the provided documents are strings. - For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"` - we would parse the string into a datetime object and then sort the documents by date. - The available options are: - -'float' will parse the meta values into floats. - -'int' will parse the meta values into integers. - -'date' will parse the meta values into datetime objects. - -'None' (default) will do no parsing. + :param meta_field: + The name of the meta field to rank by. + :param weight: + In range [0,1]. + 0 disables ranking by a meta field. + 0.5 ranking from previous component and based on meta field have the same weight. + 1 ranking by a meta field only. + :param top_k: + The maximum number of Documents to return per query. + If not provided, the Ranker returns all documents it receives in the new ranking order. + :param ranking_mode: + The mode used to combine the Retriever's and Ranker's scores. + Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. 
+ Use the 'linear_score' mode only with Retrievers or Rankers that return a score in range [0,1]. + :param sort_order: + Whether to sort the meta field by ascending or descending order. + Possible values are `descending` (default) and `ascending`. + :param meta_value_type: + Parse the meta value into the data type specified before sorting. + This will only work if all meta values stored under `meta_field` in the provided documents are strings. + For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"` + we would parse the string into a datetime object and then sort the documents by date. + The available options are: + - 'float' will parse the meta values into floats. + - 'int' will parse the meta values into integers. + - 'date' will parse the meta values into datetime objects. + - 'None' (default) will do no parsing. """ self.meta_field = meta_field @@ -108,7 +115,8 @@ def _validate_params( if sort_order not in ["ascending", "descending"]: raise ValueError( - "The value of parameter must be 'ascending' or 'descending', but is currently set to '%s'.\n" + "The value of parameter must be 'ascending' or 'descending', " + "but is currently set to '%s'.\n" "Change the value to 'ascending' or 'descending' when initializing the " "MetaFieldRanker." % sort_order ) @@ -121,20 +129,6 @@ def _validate_params( "MetaFieldRanker." % meta_value_type ) - def to_dict(self) -> Dict[str, Any]: - """ - Serialize object to a dictionary. - """ - return default_to_dict( - self, - meta_field=self.meta_field, - weight=self.weight, - top_k=self.top_k, - ranking_mode=self.ranking_mode, - sort_order=self.sort_order, - meta_value_type=self.meta_value_type, - ) - @component.output_types(documents=List[Document]) def run( self, @@ -146,35 +140,52 @@ def run( meta_value_type: Optional[Literal["float", "int", "date"]] = None, ): """ - Use this method to rank a list of Documents based on the selected meta field by: + Ranks a list of Documents based on the selected meta field by: 1. Sorting the Documents by the meta field in descending or ascending order. - 2. Merging the scores from the meta field with the scores from the previous component according to the strategy and weight provided. + 2. Merging the rankings from the previous component and based on the meta field according to ranking mode and + weight. 3. Returning the top-k documents. - :param documents: Documents to be ranked. - :param top_k: (optional) The number of Documents you want the Ranker to return. - If not provided, the top_k provided at initialization time is used. - :param weight: (optional) In range [0,1]. - 0 disables ranking by a meta field. - 0.5 content and meta fields have the same impact for the ranking. - 1 means ranking by a meta field only. The highest value comes first. - If not provided, the weight provided at initialization time is used. - :param ranking_mode: (optional) The mode used to combine the Retriever's and Ranker's scores. - Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'. - Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1]. - If not provided, the ranking_mode provided at initialization time is used. - :param sort_order: Whether to sort the meta field by ascending or descending order. - Possible values are `descending` (default) and `ascending`. - If not provided, the sort_order provided at initialization time is used. - :param meta_value_type: Parse the meta value into the data type specified before sorting. 
- This will only work if all meta values stored under `meta_field` in the provided documents are strings.
- For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
- we would parse the string into a datetime object and then sort the documents by date.
- The available options are:
- -'float' will parse the meta values into floats.
- -'int' will parse the meta values into integers.
- -'date' will parse the meta values into datetime objects.
- -'None' (default) will do no parsing.
+ :param documents:
+ Documents to be ranked.
+ :param top_k:
+ The maximum number of Documents to return per query.
+ If not provided, the top_k provided at initialization time is used.
+ :param weight:
+ In range [0,1].
+ 0 disables ranking by a meta field.
+ 0.5 ranking from previous component and based on meta field have the same weight.
+ 1 ranking by a meta field only.
+ If not provided, the weight provided at initialization time is used.
+ :param ranking_mode:
+ The mode used to combine the Retriever's and Ranker's scores.
+ Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
+ Use the 'linear_score' mode only with Retrievers or Rankers that return a score in range [0,1].
+ If not provided, the ranking_mode provided at initialization time is used.
+ :param sort_order:
+ Whether to sort the meta field by ascending or descending order.
+ Possible values are `descending` (default) and `ascending`.
+ If not provided, the sort_order provided at initialization time is used.
+ :param meta_value_type:
+ Parse the meta value into the data type specified before sorting.
+ This will only work if all meta values stored under `meta_field` in the provided documents are strings.
+ For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
+ we would parse the string into a datetime object and then sort the documents by date.
+ The available options are:
+ - 'float' will parse the meta values into floats.
+ - 'int' will parse the meta values into integers.
+ - 'date' will parse the meta values into datetime objects.
+ - 'None' (default) will do no parsing.
+ :returns:
+ A dictionary with the following keys:
+ - `documents`: List of Documents sorted by the specified meta field.
+
+ :raises ValueError:
+ If `top_k` is not > 0.
+ If `weight` is not in range [0,1].
+ If `ranking_mode` is not 'reciprocal_rank_fusion' or 'linear_score'.
+ If `sort_order` is not 'ascending' or 'descending'.
+ If `meta_value_type` is not 'float', 'int', 'date' or `None`.
"""
if not documents:
return {"documents": []}
@@ -330,8 +341,8 @@ def _calc_linear_score(rank: int, amount: int) -> float:
"""
Calculate the meta field score as a linear score between the greatest and the lowest score in the list.
This linear scaling is useful for:
- - Reducing the effect of outliers
- - Creating scores that are meaningfully distributed in the range [0,1],
- similar to scores coming from a Retriever or Ranker.
+ - Reducing the effect of outliers
+ - Creating scores that are meaningfully distributed in the range [0,1],
+ similar to scores coming from a Retriever or Ranker.
""" return (amount - rank) / amount diff --git a/haystack/components/rankers/transformers_similarity.py b/haystack/components/rankers/transformers_similarity.py index f69a1af83b..af26fc8b91 100644 --- a/haystack/components/rankers/transformers_similarity.py +++ b/haystack/components/rankers/transformers_similarity.py @@ -19,20 +19,21 @@ class TransformersSimilarityRanker: """ Ranks Documents based on their similarity to the query. + It uses a pre-trained cross-encoder model (from the Hugging Face Hub) to embed the query and the Documents. Usage example: - ``` + ```python from haystack import Document from haystack.components.rankers import TransformersSimilarityRanker ranker = TransformersSimilarityRanker() docs = [Document(content="Paris"), Document(content="Berlin")] query = "City in Germany" - output = ranker.run(query=query, documents=docs) - docs = output["documents"] - assert len(docs) == 2 - assert docs[0].content == "Berlin" + ranker.warm_up() + result = ranker.run(query=query, documents=docs) + docs = result["documents"] + print(docs[0].content) ``` """ @@ -54,30 +55,39 @@ def __init__( """ Creates an instance of TransformersSimilarityRanker. - :param model: The name or path of a pre-trained cross-encoder model - from the Hugging Face Hub. - :param device: The device on which the model is loaded. If `None`, the default device is automatically - selected. - :param token: The API token used to download private models from Hugging Face. - If this parameter is set to `True`, the token generated when running - `transformers-cli login` (stored in ~/.huggingface) is used. - :param top_k: The maximum number of Documents to return per query. - :param query_prefix: A string to add to the beginning of the query text before ranking. - Can be used to prepend the text with an instruction, as required by some reranking models, - such as bge. - :param document_prefix: A string to add to the beginning of each Document text before ranking. - Can be used to prepend the text with an instruction, as required by some embedding models, - such as bge. - :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document content. - :param embedding_separator: Separator used to concatenate the meta fields to the Document content. - :param scale_score: Whether the raw logit predictions will be scaled using a Sigmoid activation function. + :param model: + The name or path of a pre-trained cross-encoder model from the Hugging Face Hub. + :param device: + The device on which the model is loaded. If `None`, the default device is automatically selected. + :param token: + The API token used to download private models from Hugging Face. + :param top_k: + The maximum number of Documents to return per query. + :param query_prefix: + A string to add to the beginning of the query text before ranking. + Can be used to prepend the text with an instruction, as required by some reranking models, such as bge. + :param document_prefix: + A string to add to the beginning of each Document text before ranking. Can be used to prepend the text with + an instruction, as required by some embedding models, such as bge. + :param meta_fields_to_embed: + List of meta fields that should be embedded along with the Document content. + :param embedding_separator: + Separator used to concatenate the meta fields to the Document content. + :param scale_score: + Whether the raw logit predictions will be scaled using a Sigmoid activation function. 
Set this to False if you do not want any scaling of the raw logit predictions. - :param calibration_factor: Factor used for calibrating probabilities calculated by - `sigmoid(logits * calibration_factor)`. This is only used if `scale_score` is set to True. - :param score_threshold: If provided only returns documents with a score above this threshold. + :param calibration_factor: + Factor used for calibrating probabilities calculated by `sigmoid(logits * calibration_factor)`. + This is only used if `scale_score` is set to True. + :param score_threshold: + If provided only returns documents with a score above this threshold. :param model_kwargs: Additional keyword arguments passed to `AutoModelForSequenceClassification.from_pretrained` when loading the model specified in `model`. For details on what kwargs you can pass, see the model's documentation. + + :raises ValueError: + If `top_k` is not > 0. + If `scale_score` is True and `calibration_factor` is not provided. """ torch_and_transformers_import.check() @@ -115,7 +125,7 @@ def _get_telemetry_data(self) -> Dict[str, Any]: def warm_up(self): """ - Warm up the model and tokenizer used for scoring the Documents. + Initializes the component. """ if self.model is None: self.model = AutoModelForSequenceClassification.from_pretrained( @@ -128,7 +138,10 @@ def warm_up(self): def to_dict(self) -> Dict[str, Any]: """ - Serialize this component to a dictionary. + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. """ serialization_dict = default_to_dict( self, @@ -152,7 +165,12 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "TransformersSimilarityRanker": """ - Deserialize this component from a dictionary. + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. """ deserialize_secrets_inplace(data["init_parameters"], keys=["token"]) init_params = data["init_parameters"] @@ -175,15 +193,29 @@ def run( """ Returns a list of Documents ranked by their similarity to the given query. - :param query: Query string. - :param documents: List of Documents. - :param top_k: The maximum number of Documents you want the Ranker to return. - :param scale_score: Whether the raw logit predictions will be scaled using a Sigmoid activation function. + :param query: + Query string. + :param documents: + List of Documents. + :param top_k: + The maximum number of Documents you want the Ranker to return. + :param scale_score: + Whether the raw logit predictions will be scaled using a Sigmoid activation function. Set this to False if you do not want any scaling of the raw logit predictions. - :param calibration_factor: Factor used for calibrating probabilities calculated by + :param calibration_factor: + Factor used for calibrating probabilities calculated by `sigmoid(logits * calibration_factor)`. This is only used if `scale_score` is set to True. - :param score_threshold: If provided only returns documents with a score above this threshold. - :return: List of Documents sorted by their similarity to the query with the most similar Documents appearing first. + :param score_threshold: + If provided only returns documents with a score above this threshold. + :returns: + A dictionary with the following keys: + - `documents`: List of Documents most similar to the given query in descending order of similarity. + + :raises ValueError: + If `top_k` is not > 0. 
+ If `scale_score` is True and `calibration_factor` is not provided. + :raises ComponentError: + If the model is not loaded because `warm_up()` was not called before. """ if not documents: return {"documents": []} diff --git a/haystack/components/readers/extractive.py b/haystack/components/readers/extractive.py index 724c6b8133..fe4679476f 100644 --- a/haystack/components/readers/extractive.py +++ b/haystack/components/readers/extractive.py @@ -21,19 +21,28 @@ @component class ExtractiveReader: """ - A component that locates and extract answers to a given query from Documents. It's used for performing extractive - QA. The Reader assigns a score to every possible answer span independently of other answer spans. + Locates and extracts answers to a given query from Documents. + + The ExtractiveReader component performs extractive question answering. + It assigns a score to every possible answer span independently of other answer spans. This fixes a common issue of other implementations which make comparisons across documents harder by normalizing each document's answers independently. Example usage: ```python - p = Pipeline() - p.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever") - p.add_component(instance=ExtractiveReader(), name="reader") - p.connect("retriever", "reader") + from haystack import Document + from haystack.components.readers import ExtractiveReader + + docs = [ + Document(content="Python is a popular programming language"), + Document(content="python ist eine beliebte Programmiersprache"), + ] + + reader = ExtractiveReader() + reader.warm_up() + question = "Who lives in Berlin?" - p.run({"retriever": {"query": question}, "reader": {"query": question}}) + reader.run(query=question, documents=docs) ``` """ @@ -54,37 +63,43 @@ def __init__( model_kwargs: Optional[Dict[str, Any]] = None, ) -> None: """ - Creates an ExtractiveReader - :param model: A Hugging Face transformers question answering model. + Creates an instance of ExtractiveReader. + + :param model: + A Hugging Face transformers question answering model. Can either be a path to a folder containing the model files or an identifier for the Hugging Face hub. - Default: `'deepset/roberta-base-squad2-distilled'` - :param device: The device on which the model is loaded. If `None`, the default device is automatically - selected. - :param token: The API token used to download private models from Hugging Face. - If this parameter is set to `True`, then the token generated when running - `transformers-cli login` (stored in ~/.huggingface) is used. - :param top_k: Number of answers to return per query. - It is required even if score_threshold is set. Defaults to 20. + :param device: + The device on which the model is loaded. If `None`, the default device is automatically selected. + :param token: + The API token used to download private models from Hugging Face. + :param top_k: + Number of answers to return per query. It is required even if score_threshold is set. An additional answer with no text is returned if no_answer is set to True (default). - :param score_threshold: Returns only answers with the probability score above this threshold. - :param max_seq_length: Maximum number of tokens. - If a sequence exceeds it, the sequence is split. - Default: 384 - :param stride: Number of tokens that overlap when sequence is split because it exceeds max_seq_length. - Default: 128 - :param max_batch_size: Maximum number of samples that are fed through the model at the same time. 
- :param answers_per_seq: Number of answer candidates to consider per sequence. + :param score_threshold: + Returns only answers with the probability score above this threshold. + :param max_seq_length: + Maximum number of tokens. If a sequence exceeds it, the sequence is split. + :param stride: + Number of tokens that overlap when sequence is split because it exceeds max_seq_length. + :param max_batch_size: + Maximum number of samples that are fed through the model at the same time. + :param answers_per_seq: + Number of answer candidates to consider per sequence. This is relevant when a Document was split into multiple sequences because of max_seq_length. - :param no_answer: Whether to return an additional `no answer` with an empty text and a score representing the + :param no_answer: + Whether to return an additional `no answer` with an empty text and a score representing the probability that the other top_k answers are incorrect. - :param calibration_factor: Factor used for calibrating probabilities. - :param overlap_threshold: If set this will remove duplicate answers if they have an overlap larger than the + :param calibration_factor: + Factor used for calibrating probabilities. + :param overlap_threshold: + If set this will remove duplicate answers if they have an overlap larger than the supplied threshold. For example, for the answers "in the river in Maine" and "the river" we would remove one of these answers since the second answer has a 100% (1.0) overlap with the first answer. However, for the answers "the river in" and "in Maine" there is only a max overlap percentage of 25% so both of these answers could be kept if this variable is set to 0.24 or lower. If None is provided then all answers are kept. - :param model_kwargs: Additional keyword arguments passed to `AutoModelForQuestionAnswering.from_pretrained` + :param model_kwargs: + Additional keyword arguments passed to `AutoModelForQuestionAnswering.from_pretrained` when loading the model specified in `model`. For details on what kwargs you can pass, see the model's documentation. """ @@ -115,7 +130,10 @@ def _get_telemetry_data(self) -> Dict[str, Any]: def to_dict(self) -> Dict[str, Any]: """ - Serialize this component to a dictionary. + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. """ serialization_dict = default_to_dict( self, @@ -139,7 +157,12 @@ def to_dict(self) -> Dict[str, Any]: @classmethod def from_dict(cls, data: Dict[str, Any]) -> "ExtractiveReader": """ - Deserialize this component from a dictionary. + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. """ init_params = data["init_parameters"] if init_params["device"] is not None: @@ -150,7 +173,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "ExtractiveReader": def warm_up(self): """ - Loads model and tokenizer + Initializes the component. """ # Take the first device used by `accelerate`. Needed to pass inputs from the tokenizer to the correct device. if self.model is None: @@ -422,16 +445,20 @@ def deduplicate_by_overlap( self, answers: List[ExtractedAnswer], overlap_threshold: Optional[float] ) -> List[ExtractedAnswer]: """ - This de-duplicates overlapping Extractive Answers from the same document based on how much the spans of the + De-duplicates overlapping Extractive Answers from the same document based on how much the spans of the answers overlap. - :param answers: List of answers to be deduplicated. 
- :param overlap_threshold: If set this will remove duplicate answers if they have an overlap larger than the + :param answers: + List of answers to be deduplicated. + :param overlap_threshold: + If set this will remove duplicate answers if they have an overlap larger than the supplied threshold. For example, for the answers "in the river in Maine" and "the river" we would remove one of these answers since the second answer has a 100% (1.0) overlap with the first answer. However, for the answers "the river in" and "in Maine" there is only a max overlap percentage of 25% so both of these answers could be kept if this variable is set to 0.24 or lower. If None is provided then all answers are kept. + :returns: + List of deduplicated answers. """ if overlap_threshold is None: return answers diff --git a/test/components/rankers/test_metafield.py b/test/components/rankers/test_metafield.py index d729c55b4a..4c44f7a6d3 100644 --- a/test/components/rankers/test_metafield.py +++ b/test/components/rankers/test_metafield.py @@ -1,48 +1,12 @@ -import pytest import logging +import pytest + from haystack import Document from haystack.components.rankers.meta_field import MetaFieldRanker class TestMetaFieldRanker: - def test_to_dict(self): - component = MetaFieldRanker(meta_field="rating") - data = component.to_dict() - assert data == { - "type": "haystack.components.rankers.meta_field.MetaFieldRanker", - "init_parameters": { - "meta_field": "rating", - "weight": 1.0, - "top_k": None, - "ranking_mode": "reciprocal_rank_fusion", - "sort_order": "descending", - "meta_value_type": None, - }, - } - - def test_to_dict_with_custom_init_parameters(self): - component = MetaFieldRanker( - meta_field="rating", - weight=0.5, - top_k=5, - ranking_mode="linear_score", - sort_order="ascending", - meta_value_type="date", - ) - data = component.to_dict() - assert data == { - "type": "haystack.components.rankers.meta_field.MetaFieldRanker", - "init_parameters": { - "meta_field": "rating", - "weight": 0.5, - "top_k": 5, - "ranking_mode": "linear_score", - "sort_order": "ascending", - "meta_value_type": "date", - }, - } - @pytest.mark.parametrize("meta_field_values, expected_first_value", [([1.3, 0.7, 2.1], 2.1), ([1, 5, 8], 8)]) def test_run(self, meta_field_values, expected_first_value): """
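
The docstrings updated in this patch describe how `MetaFieldRanker` merges the ranking produced by a previous component with a ranking derived from a meta field (`weight`, `ranking_mode`, `sort_order`, `meta_value_type`), and that `TransformersSimilarityRanker` must be warmed up before `run()`. The snippet below is a minimal sketch of that flow, using only parameters documented above; the documents, their `rating` meta values, the query, and the choice of `weight=0.5` with `ranking_mode="linear_score"` are illustrative assumptions, running it downloads the similarity ranker's default cross-encoder model, and the final order depends on the model's scores.

```python
from haystack import Document
from haystack.components.rankers import TransformersSimilarityRanker
from haystack.components.rankers.meta_field import MetaFieldRanker

docs = [
    Document(content="Berlin is the capital of Germany.", meta={"rating": 2}),
    Document(content="Munich is a large city in southern Germany.", meta={"rating": 5}),
    Document(content="Paris is the capital of France.", meta={"rating": 4}),
]

# Score the documents against the query; warm_up() loads the model and must be
# called before run(), as the updated usage example above shows.
similarity_ranker = TransformersSimilarityRanker(top_k=3)
similarity_ranker.warm_up()
by_similarity = similarity_ranker.run(query="City in Germany", documents=docs)["documents"]

# Merge the similarity ranking with a ranking by the "rating" meta field.
# weight=0.5 gives both rankings equal influence; 'linear_score' expects the
# incoming scores to be in [0, 1], which holds here because the similarity
# ranker scales its logits with a sigmoid unless scale_score is set to False.
meta_ranker = MetaFieldRanker(meta_field="rating", weight=0.5, ranking_mode="linear_score")
reranked = meta_ranker.run(documents=by_similarity)["documents"]

for doc in reranked:
    print(doc.meta["rating"], doc.content)
```

For the meta-field side of the merge, the `_calc_linear_score` helper shown above assigns `(amount - rank) / amount`, so with three documents the ranks 0, 1, and 2 map to 1.0, 2/3, and 1/3 before being combined with the incoming scores according to `weight`.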