docs: Update docs of MetaFieldRanker, TransformersSimilarityRanker (#…

…7301)

* docs: Update docstrings of MetaFieldRanker and TransformersSimilarityRanker
* add warm_up() call to usage example
* Apply suggestions from code review

Co-authored-by: Stefano Fiorucci <[email protected]>

* show result of usage example

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
deepset-ai · Mar 5, 2024 · c1c0cbf · c1c0cbf
1 parent f0fb71d
commit c1c0cbf
Show file tree

Hide file tree

Showing 4 changed files with 208 additions and 174 deletions.
diff --git a/haystack/components/rankers/meta_field.py b/haystack/components/rankers/meta_field.py
@@ -3,7 +3,7 @@
 
 from dateutil.parser import parse as date_parse
 
-from haystack import Document, component, default_to_dict, logging
+from haystack import Document, component, logging
 
 logger = logging.getLogger(__name__)
 
@@ -12,6 +12,7 @@
 class MetaFieldRanker:
     """
     Ranks Documents based on the value of their specific meta field.
+
     The ranking can be performed in descending order or ascending order.
 
     Usage example:
@@ -43,27 +44,33 @@ def __init__(
         """
         Creates an instance of MetaFieldRanker.
 
-        :param meta_field: The name of the meta field to rank by.
-        :param weight: In range [0,1].
-                0 disables ranking by a meta field.
-                0.5 content and meta fields have the same impact for the ranking.
-                1 means ranking by a meta field only. The highest value comes first.
-        :param top_k: The maximum number of Documents you want the Ranker to return per query. If not provided, the
-                Ranker returns all documents it receives in the new ranking order.
-        :param ranking_mode: The mode used to combine the Retriever's and Ranker's scores.
-                Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
-                Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1].
-        :param sort_order: Whether to sort the meta field by ascending or descending order.
-                Possible values are `descending` (default) and `ascending`.
-        :param meta_value_type: Parse the meta value into the data type specified before sorting.
-                This will only work if all meta values stored under `meta_field` in the provided documents are strings.
-                For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
-                we would parse the string into a datetime object and then sort the documents by date.
-                The available options are:
-                -'float' will parse the meta values into floats.
-                -'int' will parse the meta values into integers.
-                -'date' will parse the meta values into datetime objects.
-                -'None' (default) will do no parsing.
+        :param meta_field:
+            The name of the meta field to rank by.
+        :param weight:
+            In range [0,1].
+            0 disables ranking by a meta field.
+            0.5 weights the previous component's ranking and the meta field ranking equally.
+            1 ranks by the meta field only.
+        :param top_k:
+            The maximum number of Documents to return per query.
+            If not provided, the Ranker returns all documents it receives in the new ranking order.
+        :param ranking_mode:
+            The mode used to combine the Retriever's and Ranker's scores.
+            Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
+            Use the 'linear_score' mode only with Retrievers or Rankers that return a score in range [0,1].
+        :param sort_order:
+            Whether to sort the meta field by ascending or descending order.
+            Possible values are `descending` (default) and `ascending`.
+        :param meta_value_type:
+            Parse the meta value into the data type specified before sorting.
+            This will only work if all meta values stored under `meta_field` in the provided documents are strings.
+            For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
+            we would parse the string into a datetime object and then sort the documents by date.
+            The available options are:
+            - 'float' will parse the meta values into floats.
+            - 'int' will parse the meta values into integers.
+            - 'date' will parse the meta values into datetime objects.
+            - 'None' (default) will do no parsing.
         """
 
         self.meta_field = meta_field
@@ -108,7 +115,8 @@ def _validate_params(
 
         if sort_order not in ["ascending", "descending"]:
             raise ValueError(
-                "The value of parameter <sort_order> must be 'ascending' or 'descending', but is currently set to '%s'.\n"
+                "The value of parameter <sort_order> must be 'ascending' or 'descending', "
+                "but is currently set to '%s'.\n"
                 "Change the <sort_order> value to 'ascending' or 'descending' when initializing the "
                 "MetaFieldRanker." % sort_order
             )
@@ -121,20 +129,6 @@ def _validate_params(
                 "MetaFieldRanker." % meta_value_type
             )
 
-    def to_dict(self) -> Dict[str, Any]:
-        """
-        Serialize object to a dictionary.
-        """
-        return default_to_dict(
-            self,
-            meta_field=self.meta_field,
-            weight=self.weight,
-            top_k=self.top_k,
-            ranking_mode=self.ranking_mode,
-            sort_order=self.sort_order,
-            meta_value_type=self.meta_value_type,
-        )
-
     @component.output_types(documents=List[Document])
     def run(
         self,
@@ -146,35 +140,52 @@ def run(
         meta_value_type: Optional[Literal["float", "int", "date"]] = None,
     ):
         """
-        Use this method to rank a list of Documents based on the selected meta field by:
+        Ranks a list of Documents based on the selected meta field by:
         1. Sorting the Documents by the meta field in descending or ascending order.
-        2. Merging the scores from the meta field with the scores from the previous component according to the strategy and weight provided.
+        2. Merging the ranking from the previous component with the ranking based on the meta field, according to
+        the ranking mode and weight.
         3. Returning the top-k documents.
 
-        :param documents: Documents to be ranked.
-        :param top_k: (optional) The number of Documents you want the Ranker to return.
-                If not provided, the top_k provided at initialization time is used.
-        :param weight: (optional) In range [0,1].
-                0 disables ranking by a meta field.
-                0.5 content and meta fields have the same impact for the ranking.
-                1 means ranking by a meta field only. The highest value comes first.
-                If not provided, the weight provided at initialization time is used.
-        :param ranking_mode: (optional) The mode used to combine the Retriever's and Ranker's scores.
-                Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
-                Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1].
-                If not provided, the ranking_mode provided at initialization time is used.
-        :param sort_order: Whether to sort the meta field by ascending or descending order.
-                Possible values are `descending` (default) and `ascending`.
-                If not provided, the sort_order provided at initialization time is used.
-        :param meta_value_type: Parse the meta value into the data type specified before sorting.
-                This will only work if all meta values stored under `meta_field` in the provided documents are strings.
-                For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
-                we would parse the string into a datetime object and then sort the documents by date.
-                The available options are:
-                -'float' will parse the meta values into floats.
-                -'int' will parse the meta values into integers.
-                -'date' will parse the meta values into datetime objects.
-                -'None' (default) will do no parsing.
+        :param documents:
+            Documents to be ranked.
+        :param top_k:
+            The maximum number of Documents to return per query.
+            If not provided, the top_k provided at initialization time is used.
+        :param weight:
+            In range [0,1].
+            0 disables ranking by a meta field.
+            0.5 weights the previous component's ranking and the meta field ranking equally.
+            1 ranks by the meta field only.
+            If not provided, the weight provided at initialization time is used.
+        :param ranking_mode:
+            The mode used to combine the Retriever's and Ranker's scores.
+            Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
+            Use the 'linear_score' mode only with Retrievers or Rankers that return a score in range [0,1].
+            If not provided, the ranking_mode provided at initialization time is used.
+        :param sort_order:
+            Whether to sort the meta field by ascending or descending order.
+            Possible values are `descending` (default) and `ascending`.
+            If not provided, the sort_order provided at initialization time is used.
+        :param meta_value_type:
+            Parse the meta value into the data type specified before sorting.
+            This will only work if all meta values stored under `meta_field` in the provided documents are strings.
+            For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
+            we would parse the string into a datetime object and then sort the documents by date.
+            The available options are:
+            - 'float' will parse the meta values into floats.
+            - 'int' will parse the meta values into integers.
+            - 'date' will parse the meta values into datetime objects.
+            - 'None' (default) will do no parsing.
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: List of Documents sorted by the specified meta field.
+
+        :raises ValueError:
+            If `top_k` is not > 0.
+            If `weight` is not in range [0,1].
+            If `ranking_mode` is not 'reciprocal_rank_fusion' or 'linear_score'.
+            If `sort_order` is not 'ascending' or 'descending'.
+            If `meta_value_type` is not 'float', 'int', 'date' or `None`.
         """
         if not documents:
             return {"documents": []}
@@ -330,8 +341,8 @@ def _calc_linear_score(rank: int, amount: int) -> float:
         """
         Calculate the meta field score as a linear score between the greatest and the lowest score in the list.
         This linear scaling is useful for:
-          - Reducing the effect of outliers
-          - Creating scores that are meaningfully distributed in the range [0,1],
-             similar to scores coming from a Retriever or Ranker.
+        - Reducing the effect of outliers
+        - Creating scores that are meaningfully distributed in the range [0,1],
+        similar to scores coming from a Retriever or Ranker.
         """
         return (amount - rank) / amount
diff --git a/haystack/components/rankers/transformers_similarity.py b/haystack/components/rankers/transformers_similarity.py
@@ -19,20 +19,21 @@
 class TransformersSimilarityRanker:
     """
     Ranks Documents based on their similarity to the query.
+
     It uses a pre-trained cross-encoder model (from the Hugging Face Hub) to embed the query and the Documents.
 
     Usage example:
-    ```
+    ```python
     from haystack import Document
     from haystack.components.rankers import TransformersSimilarityRanker
 
     ranker = TransformersSimilarityRanker()
     docs = [Document(content="Paris"), Document(content="Berlin")]
     query = "City in Germany"
-    output = ranker.run(query=query, documents=docs)
-    docs = output["documents"]
-    assert len(docs) == 2
-    assert docs[0].content == "Berlin"
+    ranker.warm_up()
+    result = ranker.run(query=query, documents=docs)
+    docs = result["documents"]
+    print(docs[0].content)
     ```
     """
 
@@ -54,30 +55,39 @@ def __init__(
         """
         Creates an instance of TransformersSimilarityRanker.
 
-        :param model: The name or path of a pre-trained cross-encoder model
-            from the Hugging Face Hub.
-        :param device: The device on which the model is loaded. If `None`, the default device is automatically
-            selected.
-        :param token: The API token used to download private models from Hugging Face.
-            If this parameter is set to `True`, the token generated when running
-            `transformers-cli login` (stored in ~/.huggingface) is used.
-        :param top_k: The maximum number of Documents to return per query.
-        :param query_prefix: A string to add to the beginning of the query text before ranking.
-            Can be used to prepend the text with an instruction, as required by some reranking models,
-            such as bge.
-        :param document_prefix: A string to add to the beginning of each Document text before ranking.
-            Can be used to prepend the text with an instruction, as required by some embedding models,
-            such as bge.
-        :param meta_fields_to_embed: List of meta fields that should be embedded along with the Document content.
-        :param embedding_separator: Separator used to concatenate the meta fields to the Document content.
-        :param scale_score: Whether the raw logit predictions will be scaled using a Sigmoid activation function.
+        :param model:
+            The name or path of a pre-trained cross-encoder model from the Hugging Face Hub.
+        :param device:
+            The device on which the model is loaded. If `None`, the default device is automatically selected.
+        :param token:
+            The API token used to download private models from Hugging Face.
+        :param top_k:
+            The maximum number of Documents to return per query.
+        :param query_prefix:
+            A string to add to the beginning of the query text before ranking.
+            Can be used to prepend the text with an instruction, as required by some reranking models, such as bge.
+        :param document_prefix:
+            A string to add to the beginning of each Document text before ranking. Can be used to prepend the text with
+            an instruction, as required by some embedding models, such as bge.
+        :param meta_fields_to_embed:
+            List of meta fields that should be embedded along with the Document content.
+        :param embedding_separator:
+            Separator used to concatenate the meta fields to the Document content.
+        :param scale_score:
+            Whether the raw logit predictions will be scaled using a Sigmoid activation function.
             Set this to False if you do not want any scaling of the raw logit predictions.
-        :param calibration_factor: Factor used for calibrating probabilities calculated by
-            `sigmoid(logits * calibration_factor)`. This is only used if `scale_score` is set to True.
-        :param score_threshold: If provided only returns documents with a score above this threshold.
+        :param calibration_factor:
+            Factor used for calibrating probabilities calculated by `sigmoid(logits * calibration_factor)`.
+            This is only used if `scale_score` is set to True.
+        :param score_threshold:
+            If provided only returns documents with a score above this threshold.
         :param model_kwargs: Additional keyword arguments passed to `AutoModelForSequenceClassification.from_pretrained`
             when loading the model specified in `model`. For details on what kwargs you can pass,
             see the model's documentation.
+
+        :raises ValueError:
+            If `top_k` is not > 0.
+            If `scale_score` is True and `calibration_factor` is not provided.
         """
         torch_and_transformers_import.check()
 
@@ -115,7 +125,7 @@ def _get_telemetry_data(self) -> Dict[str, Any]:
 
     def warm_up(self):
         """
-        Warm up the model and tokenizer used for scoring the Documents.
+        Initializes the component by loading the model and tokenizer used for scoring the Documents.
         """
         if self.model is None:
             self.model = AutoModelForSequenceClassification.from_pretrained(
@@ -128,7 +138,10 @@ def warm_up(self):
 
     def to_dict(self) -> Dict[str, Any]:
         """
-        Serialize this component to a dictionary.
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
         """
         serialization_dict = default_to_dict(
             self,
@@ -152,7 +165,12 @@ def to_dict(self) -> Dict[str, Any]:
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "TransformersSimilarityRanker":
         """
-        Deserialize this component from a dictionary.
+        Deserializes the component from a dictionary.
+
+        :param data:
+            Dictionary to deserialize from.
+        :returns:
+            Deserialized component.
         """
         deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
         init_params = data["init_parameters"]
@@ -175,15 +193,29 @@ def run(
         """
         Returns a list of Documents ranked by their similarity to the given query.
 
-        :param query: Query string.
-        :param documents: List of Documents.
-        :param top_k: The maximum number of Documents you want the Ranker to return.
-        :param scale_score: Whether the raw logit predictions will be scaled using a Sigmoid activation function.
+        :param query:
+            Query string.
+        :param documents:
+            List of Documents.
+        :param top_k:
+            The maximum number of Documents you want the Ranker to return.
+        :param scale_score:
+            Whether the raw logit predictions will be scaled using a Sigmoid activation function.
             Set this to False if you do not want any scaling of the raw logit predictions.
-        :param calibration_factor: Factor used for calibrating probabilities calculated by
+        :param calibration_factor:
+            Factor used for calibrating probabilities calculated by
             `sigmoid(logits * calibration_factor)`. This is only used if `scale_score` is set to True.
-        :param score_threshold: If provided only returns documents with a score above this threshold.
-        :return: List of Documents sorted by their similarity to the query with the most similar Documents appearing first.
+        :param score_threshold:
+            If provided only returns documents with a score above this threshold.
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: List of Documents most similar to the given query in descending order of similarity.
+
+        :raises ValueError:
+            If `top_k` is not > 0.
+            If `scale_score` is True and `calibration_factor` is not provided.
+        :raises ComponentError:
+            If the model is not loaded because `warm_up()` was not called before.
         """
         if not documents:
             return {"documents": []}