Skip to content

Commit

Permalink
docs: Update docs of MetaFieldRanker, TransformersSimilarityRanker (#…
Browse files Browse the repository at this point in the history
…7301)

* docs: Update docstrings of MetaFieldRanker and TransformersSimilarityRanker

* add warm_up() call to usage example

* Apply suggestions from code review

Co-authored-by: Stefano Fiorucci <[email protected]>

* show result of usage example

---------

Co-authored-by: Stefano Fiorucci <[email protected]>
  • Loading branch information
julian-risch and anakin87 authored Mar 5, 2024
1 parent f0fb71d commit c1c0cbf
Show file tree
Hide file tree
Showing 4 changed files with 208 additions and 174 deletions.
143 changes: 77 additions & 66 deletions haystack/components/rankers/meta_field.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from dateutil.parser import parse as date_parse

from haystack import Document, component, default_to_dict, logging
from haystack import Document, component, logging

logger = logging.getLogger(__name__)

Expand All @@ -12,6 +12,7 @@
class MetaFieldRanker:
"""
Ranks Documents based on the value of their specific meta field.
The ranking can be performed in descending order or ascending order.
Usage example:
Expand Down Expand Up @@ -43,27 +44,33 @@ def __init__(
"""
Creates an instance of MetaFieldRanker.
:param meta_field: The name of the meta field to rank by.
:param weight: In range [0,1].
0 disables ranking by a meta field.
0.5 content and meta fields have the same impact for the ranking.
1 means ranking by a meta field only. The highest value comes first.
:param top_k: The maximum number of Documents you want the Ranker to return per query. If not provided, the
Ranker returns all documents it receives in the new ranking order.
:param ranking_mode: The mode used to combine the Retriever's and Ranker's scores.
Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1].
:param sort_order: Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`.
:param meta_value_type: Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
we would parse the string into a datetime object and then sort the documents by date.
The available options are:
-'float' will parse the meta values into floats.
-'int' will parse the meta values into integers.
-'date' will parse the meta values into datetime objects.
-'None' (default) will do no parsing.
:param meta_field:
The name of the meta field to rank by.
:param weight:
In range [0,1].
0 disables ranking by a meta field.
0.5 gives the ranking from the previous component and the ranking based on the meta field the same weight.
1 ranks by the meta field only.
:param top_k:
The maximum number of Documents to return per query.
If not provided, the Ranker returns all documents it receives in the new ranking order.
:param ranking_mode:
The mode used to combine the Retriever's and Ranker's scores.
Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
Use the 'linear_score' mode only with Retrievers or Rankers that return a score in range [0,1].
:param sort_order:
Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`.
:param meta_value_type:
Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
we would parse the string into a datetime object and then sort the documents by date.
The available options are:
- 'float' will parse the meta values into floats.
- 'int' will parse the meta values into integers.
- 'date' will parse the meta values into datetime objects.
- 'None' (default) will do no parsing.
"""

self.meta_field = meta_field
Expand Down Expand Up @@ -108,7 +115,8 @@ def _validate_params(

if sort_order not in ["ascending", "descending"]:
raise ValueError(
"The value of parameter <sort_order> must be 'ascending' or 'descending', but is currently set to '%s'.\n"
"The value of parameter <sort_order> must be 'ascending' or 'descending', "
"but is currently set to '%s'.\n"
"Change the <sort_order> value to 'ascending' or 'descending' when initializing the "
"MetaFieldRanker." % sort_order
)
Expand All @@ -121,20 +129,6 @@ def _validate_params(
"MetaFieldRanker." % meta_value_type
)

def to_dict(self) -> Dict[str, Any]:
"""
Serialize object to a dictionary.
"""
return default_to_dict(
self,
meta_field=self.meta_field,
weight=self.weight,
top_k=self.top_k,
ranking_mode=self.ranking_mode,
sort_order=self.sort_order,
meta_value_type=self.meta_value_type,
)

@component.output_types(documents=List[Document])
def run(
self,
Expand All @@ -146,35 +140,52 @@ def run(
meta_value_type: Optional[Literal["float", "int", "date"]] = None,
):
"""
Use this method to rank a list of Documents based on the selected meta field by:
Ranks a list of Documents based on the selected meta field by:
1. Sorting the Documents by the meta field in descending or ascending order.
2. Merging the scores from the meta field with the scores from the previous component according to the strategy and weight provided.
2. Merging the ranking from the previous component with the ranking based on the meta field, according to the
ranking mode and weight.
3. Returning the top-k documents.
:param documents: Documents to be ranked.
:param top_k: (optional) The number of Documents you want the Ranker to return.
If not provided, the top_k provided at initialization time is used.
:param weight: (optional) In range [0,1].
0 disables ranking by a meta field.
0.5 content and meta fields have the same impact for the ranking.
1 means ranking by a meta field only. The highest value comes first.
If not provided, the weight provided at initialization time is used.
:param ranking_mode: (optional) The mode used to combine the Retriever's and Ranker's scores.
Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1].
If not provided, the ranking_mode provided at initialization time is used.
:param sort_order: Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`.
If not provided, the sort_order provided at initialization time is used.
:param meta_value_type: Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
we would parse the string into a datetime object and then sort the documents by date.
The available options are:
-'float' will parse the meta values into floats.
-'int' will parse the meta values into integers.
-'date' will parse the meta values into datetime objects.
-'None' (default) will do no parsing.
:param documents:
Documents to be ranked.
:param top_k:
The maximum number of Documents to return per query.
If not provided, the top_k provided at initialization time is used.
:param weight:
In range [0,1].
0 disables ranking by a meta field.
0.5 gives the ranking from the previous component and the ranking based on the meta field the same weight.
1 ranks by the meta field only.
If not provided, the weight provided at initialization time is used.
:param ranking_mode:
(optional) The mode used to combine the Retriever's and Ranker's scores.
Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
Use the 'linear_score' mode only with Retrievers or Rankers that return a score in range [0,1].
If not provided, the ranking_mode provided at initialization time is used.
:param sort_order:
Whether to sort the meta field by ascending or descending order.
Possible values are `descending` (default) and `ascending`.
If not provided, the sort_order provided at initialization time is used.
:param meta_value_type:
Parse the meta value into the data type specified before sorting.
This will only work if all meta values stored under `meta_field` in the provided documents are strings.
For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
we would parse the string into a datetime object and then sort the documents by date.
The available options are:
- 'float' will parse the meta values into floats.
- 'int' will parse the meta values into integers.
- 'date' will parse the meta values into datetime objects.
- 'None' (default) will do no parsing.
:returns:
A dictionary with the following keys:
- `documents`: List of Documents sorted by the specified meta field.
:raises ValueError:
If `top_k` is not > 0.
If `weight` is not in range [0,1].
If `ranking_mode` is not 'reciprocal_rank_fusion' or 'linear_score'.
If `sort_order` is not 'ascending' or 'descending'.
If `meta_value_type` is not 'float', 'int', 'date' or `None`.
"""
if not documents:
return {"documents": []}
Expand Down Expand Up @@ -330,8 +341,8 @@ def _calc_linear_score(rank: int, amount: int) -> float:
"""
Calculate the meta field score as a linear score between the greatest and the lowest score in the list.
This linear scaling is useful for:
- Reducing the effect of outliers
- Creating scores that are meaningfully distributed in the range [0,1],
similar to scores coming from a Retriever or Ranker.
- Reducing the effect of outliers
- Creating scores that are meaningfully distributed in the range [0,1],
similar to scores coming from a Retriever or Ranker.
"""
return (amount - rank) / amount
102 changes: 67 additions & 35 deletions haystack/components/rankers/transformers_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,21 @@
class TransformersSimilarityRanker:
"""
Ranks Documents based on their similarity to the query.
It uses a pre-trained cross-encoder model (from the Hugging Face Hub) to embed the query and the Documents.
Usage example:
```
```python
from haystack import Document
from haystack.components.rankers import TransformersSimilarityRanker
ranker = TransformersSimilarityRanker()
docs = [Document(content="Paris"), Document(content="Berlin")]
query = "City in Germany"
output = ranker.run(query=query, documents=docs)
docs = output["documents"]
assert len(docs) == 2
assert docs[0].content == "Berlin"
ranker.warm_up()
result = ranker.run(query=query, documents=docs)
docs = result["documents"]
print(docs[0].content)
```
"""

Expand All @@ -54,30 +55,39 @@ def __init__(
"""
Creates an instance of TransformersSimilarityRanker.
:param model: The name or path of a pre-trained cross-encoder model
from the Hugging Face Hub.
:param device: The device on which the model is loaded. If `None`, the default device is automatically
selected.
:param token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, the token generated when running
`transformers-cli login` (stored in ~/.huggingface) is used.
:param top_k: The maximum number of Documents to return per query.
:param query_prefix: A string to add to the beginning of the query text before ranking.
Can be used to prepend the text with an instruction, as required by some reranking models,
such as bge.
:param document_prefix: A string to add to the beginning of each Document text before ranking.
Can be used to prepend the text with an instruction, as required by some embedding models,
such as bge.
:param meta_fields_to_embed: List of meta fields that should be embedded along with the Document content.
:param embedding_separator: Separator used to concatenate the meta fields to the Document content.
:param scale_score: Whether the raw logit predictions will be scaled using a Sigmoid activation function.
:param model:
The name or path of a pre-trained cross-encoder model from the Hugging Face Hub.
:param device:
The device on which the model is loaded. If `None`, the default device is automatically selected.
:param token:
The API token used to download private models from Hugging Face.
:param top_k:
The maximum number of Documents to return per query.
:param query_prefix:
A string to add to the beginning of the query text before ranking.
Can be used to prepend the text with an instruction, as required by some reranking models, such as bge.
:param document_prefix:
A string to add to the beginning of each Document text before ranking. Can be used to prepend the text with
an instruction, as required by some reranking models, such as bge.
:param meta_fields_to_embed:
List of meta fields that should be embedded along with the Document content.
:param embedding_separator:
Separator used to concatenate the meta fields to the Document content.
:param scale_score:
Whether the raw logit predictions will be scaled using a Sigmoid activation function.
Set this to False if you do not want any scaling of the raw logit predictions.
:param calibration_factor: Factor used for calibrating probabilities calculated by
`sigmoid(logits * calibration_factor)`. This is only used if `scale_score` is set to True.
:param score_threshold: If provided only returns documents with a score above this threshold.
:param calibration_factor:
Factor used for calibrating probabilities calculated by `sigmoid(logits * calibration_factor)`.
This is only used if `scale_score` is set to True.
:param score_threshold:
If provided only returns documents with a score above this threshold.
:param model_kwargs:
Additional keyword arguments passed to `AutoModelForSequenceClassification.from_pretrained`
when loading the model specified in `model`. For details on what kwargs you can pass,
see the model's documentation.
:raises ValueError:
If `top_k` is not > 0.
If `scale_score` is True and `calibration_factor` is not provided.
"""
torch_and_transformers_import.check()

Expand Down Expand Up @@ -115,7 +125,7 @@ def _get_telemetry_data(self) -> Dict[str, Any]:

def warm_up(self):
"""
Warm up the model and tokenizer used for scoring the Documents.
Initializes the component by loading the model and tokenizer used for scoring the Documents.
"""
if self.model is None:
self.model = AutoModelForSequenceClassification.from_pretrained(
Expand All @@ -128,7 +138,10 @@ def warm_up(self):

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
serialization_dict = default_to_dict(
self,
Expand All @@ -152,7 +165,12 @@ def to_dict(self) -> Dict[str, Any]:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "TransformersSimilarityRanker":
"""
Deserialize this component from a dictionary.
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
deserialize_secrets_inplace(data["init_parameters"], keys=["token"])
init_params = data["init_parameters"]
Expand All @@ -175,15 +193,29 @@ def run(
"""
Returns a list of Documents ranked by their similarity to the given query.
:param query: Query string.
:param documents: List of Documents.
:param top_k: The maximum number of Documents you want the Ranker to return.
:param scale_score: Whether the raw logit predictions will be scaled using a Sigmoid activation function.
:param query:
Query string.
:param documents:
List of Documents.
:param top_k:
The maximum number of Documents you want the Ranker to return.
:param scale_score:
Whether the raw logit predictions will be scaled using a Sigmoid activation function.
Set this to False if you do not want any scaling of the raw logit predictions.
:param calibration_factor: Factor used for calibrating probabilities calculated by
:param calibration_factor:
Factor used for calibrating probabilities calculated by
`sigmoid(logits * calibration_factor)`. This is only used if `scale_score` is set to True.
:param score_threshold: If provided only returns documents with a score above this threshold.
:return: List of Documents sorted by their similarity to the query with the most similar Documents appearing first.
:param score_threshold:
If provided only returns documents with a score above this threshold.
:returns:
A dictionary with the following keys:
- `documents`: List of Documents most similar to the given query in descending order of similarity.
:raises ValueError:
If `top_k` is not > 0.
If `scale_score` is True and `calibration_factor` is not provided.
:raises ComponentError:
If the model is not loaded because `warm_up()` was not called before.
"""
if not documents:
return {"documents": []}
Expand Down
Loading

0 comments on commit c1c0cbf

Please sign in to comment.