Docs: Update Rankers docstrings and messages (#6296)
* Update docstrings and messages

* Fix tests

* Fix formatting

* Update haystack/preview/components/rankers/meta_field.py

Co-authored-by: Silvano Cerza <[email protected]>

* Fix tests

---------

Co-authored-by: Silvano Cerza <[email protected]>
Co-authored-by: Silvano Cerza <[email protected]>
3 people authored Nov 20, 2023
1 parent 0ef06e7 commit 497299c
Showing 3 changed files with 42 additions and 41 deletions.
54 changes: 27 additions & 27 deletions haystack/preview/components/rankers/meta_field.py
@@ -11,7 +11,7 @@
@component
class MetaFieldRanker:
"""
Ranks documents based on the value of a metadata field.
Ranks Documents based on the value of their specific metadata field. The ranking is done in a descending order.
Usage example:
```
@@ -42,13 +42,13 @@ def __init__(
:param metadata_field: The name of the metadata field to rank by.
:param weight: In range [0,1].
0 disables sorting by metadata field.
0.5 content and metadata fields have the same impact.
1 means sorting only by metadata field, highest value comes first.
:param top_k: The maximum number of documents to return.
:param ranking_mode: The mode used to combine retriever and recentness.
0 disables ranking by a metadata field.
0.5 content and metadata fields have the same impact for the ranking.
1 means ranking by a metadata field only. The highest value comes first.
:param top_k: The maximum number of Documents you want the Ranker to return per query.
:param ranking_mode: The mode used to combine the Retriever's and Ranker's scores.
Possible values are 'reciprocal_rank_fusion' (default) and 'linear_score'.
Make sure to use 'score' mode only with retrievers/rankers that give back OK score in range [0,1].
Use the 'score' mode only with Retrievers or Rankers that return a score in range [0,1].
"""

self.metadata_field = metadata_field
@@ -59,9 +59,9 @@ def __init__(
if self.weight < 0 or self.weight > 1:
raise ValueError(
"""
Param <weight> needs to be in range [0,1] but was set to '{}'.\n
'0' disables sorting by metadata field, '0.5' gives equal weight to previous relevance scores and metadata field, and '1' ranks by metadata field only.\n
Please change param <weight> when initializing the MetaFieldRanker.
Parameter <weight> must be in range [0,1] but is currently set to '{}'.\n
'0' disables sorting by a metadata field, '0.5' assigns equal weight to the previous relevance scores and the metadata field, and '1' ranks by the metadata field only.\n
Change the <weight> parameter to a value in range 0 to 1 when initializing the MetaFieldRanker.
""".format(
self.weight
)
@@ -70,8 +70,8 @@ def __init__(
if self.ranking_mode not in ["reciprocal_rank_fusion", "linear_score"]:
raise ValueError(
"""
Param <ranking_mode> needs to be 'reciprocal_rank_fusion' or 'linear_score' but was set to '{}'. \n
Please change the <ranking_mode> when initializing the MetaFieldRanker.
The value of parameter <ranking_mode> must be 'reciprocal_rank_fusion' or 'linear_score', but is currently set to '{}'. \n
Change the <ranking_mode> value to 'reciprocal_rank_fusion' or 'linear_score' when initializing the MetaFieldRanker.
""".format(
self.ranking_mode
)
@@ -92,13 +92,13 @@ def to_dict(self) -> Dict[str, Any]:
@component.output_types(documents=List[Document])
def run(self, documents: List[Document], top_k: Optional[int] = None):
"""
This method is used to rank a list of documents based on the selected metadata field by:
1. Sorting the documents by the metadata field in descending order.
Use this method to rank a list of Documents based on the selected metadata field by:
1. Sorting the Documents by the metadata field in descending order.
2. Merging the scores from the metadata field with the scores from the previous component according to the strategy and weight provided.
3. Returning the top-k documents.
:param documents: Documents provided for ranking.
:param top_k: (optional) How many documents to return at the end. If not provided, all documents will be returned.
:param documents: Documents to be ranked.
:param top_k: (optional) The number of Documents you want the Ranker to return. If not provided, the Ranker returns all Documents it received.
"""
if not documents:
return {"documents": []}
@@ -113,9 +113,9 @@ def run(self, documents: List[Document], top_k: Optional[int] = None):
except KeyError:
raise ComponentError(
"""
Param <metadata_field> was set to '{}' but document(s) {} do not contain this metadata key.\n
Please double-check the names of existing metadata fields of your documents \n
and set <metadata_field> to the name of the field that contains the metadata you want to rank by.
The parameter <metadata_field> is currently set to '{}' but the Documents {} don't have this metadata key.\n
Double-check the names of the metadata fields in your documents \n
and set <metadata_field> to the name of the field that contains the metadata you want to use for ranking.
""".format(
self.metadata_field, ",".join([doc.id for doc in documents if self.metadata_field not in doc.meta])
)
@@ -129,7 +129,7 @@ def run(self, documents: List[Document], top_k: Optional[int] = None):

def _merge_scores(self, documents: List[Document], sorted_documents: List[Document]) -> List[Document]:
"""
Merge scores for documents sorted both by content and by metadata field.
Merge scores for Documents sorted both by their content and by their metadata field.
"""
scores_map: Dict = defaultdict(int)

@@ -141,10 +141,10 @@ def _merge_scores(self, documents: List[Document], sorted_documents: List[Docume
for i, (doc, sorted_doc) in enumerate(zip(documents, sorted_documents)):
score = float(0)
if doc.score is None:
warnings.warn("The score was not provided; defaulting to 0")
warnings.warn("The score wasn't provided; defaulting to 0.")
elif doc.score < 0 or doc.score > 1:
warnings.warn(
"The score {} for document {} is outside the [0,1] range; defaulting to 0".format(
"The score {} for Document {} is outside the [0,1] range; defaulting to 0".format(
doc.score, doc.id
)
)
@@ -164,17 +164,17 @@ def _merge_scores(self, documents: List[Document], sorted_documents: List[Docume
def _calculate_rrf(rank: int, k: int = 61) -> float:
"""
Calculates the reciprocal rank fusion. The constant K is set to 61 (60 was suggested by the original paper,
plus 1 as python lists are 0-based and the paper [https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf] used 1-based ranking).
plus 1 as python lists are 0-based and the [paper](https://plg.uwaterloo.ca/~gvcormac/cormacksigir09-rrf.pdf) used 1-based ranking).
"""
return 1 / (k + rank)

@staticmethod
def _calc_linear_score(rank: int, amount: int) -> float:
"""
Calculate the metadata field score as a linear score between the greatest and the lowest score in the list.
This linear scaling is useful to
a) reduce the effect of outliers and
b) create scores that are meaningfully distributed in [0,1],
similar to scores coming from a retriever/ranker.
This linear scaling is useful for:
- Reducing the effect of outliers
- Creating scores that are meaningfully distributed in the range [0,1],
similar to scores coming from a Retriever or Ranker.
"""
return (amount - rank) / amount
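For a quick sanity check of the two scoring helpers above, the formulas can be restated standalone; the snippet below only re-implements the math shown in this diff and is not part of the component itself.

```python
# Standalone restatement of the two scoring formulas above, purely to show
# the values they produce.

def rrf(rank: int, k: int = 61) -> float:
    # Reciprocal rank fusion; k is 61 (60 from the paper, plus 1 because
    # the ranks here are 0-based Python indices).
    return 1 / (k + rank)

def linear_score(rank: int, amount: int) -> float:
    # Linear scaling over `amount` Documents: the best rank (0) scores 1.0,
    # the worst rank (amount - 1) scores 1 / amount.
    return (amount - rank) / amount

print(rrf(0), rrf(1))       # ~0.0164 and ~0.0161
print(linear_score(0, 4))   # 1.0
print(linear_score(3, 4))   # 0.25
```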
24 changes: 12 additions & 12 deletions haystack/preview/components/rankers/transformers_similarity.py
@@ -16,8 +16,8 @@
@component
class TransformersSimilarityRanker:
"""
Ranks documents based on query similarity.
It uses a pre-trained cross-encoder model (from Hugging Face Hub) to embed the query and documents.
Ranks Documents based on their similarity to the query.
It uses a pre-trained cross-encoder model (from the Hugging Face Hub) to embed the query and the Documents.
Usage example:
```
@@ -45,12 +45,12 @@ def __init__(
Creates an instance of TransformersSimilarityRanker.
:param model_name_or_path: The name or path of a pre-trained cross-encoder model
from Hugging Face Hub.
:param device: torch device (for example, cuda:0, cpu, mps) to limit model inference to a specific device.
from the Hugging Face Hub.
:param device: The torch device (for example, cuda:0, cpu, mps) to which you want to limit model inference.
:param token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) will be used.
:param top_k: The maximum number of documents to return per query.
If this parameter is set to `True`, the token generated when running
`transformers-cli login` (stored in ~/.huggingface) is used.
:param top_k: The maximum number of Documents to return per query.
"""
torch_and_transformers_import.check()

@@ -71,7 +71,7 @@ def _get_telemetry_data(self) -> Dict[str, Any]:

def warm_up(self):
"""
Warm up the model and tokenizer used in scoring the documents.
Warm up the model and tokenizer used for scoring the Documents.
"""
if self.model_name_or_path and not self.model:
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name_or_path, token=self.token)
@@ -94,12 +94,12 @@ def to_dict(self) -> Dict[str, Any]:
@component.output_types(documents=List[Document])
def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):
"""
Returns a list of documents ranked by their similarity to the given query
Returns a list of Documents ranked by their similarity to the given query.
:param query: Query string.
:param documents: List of Documents.
:param top_k: The maximum number of documents to return.
:return: List of Documents sorted by (desc.) similarity with the query.
:param top_k: The maximum number of Documents you want the Ranker to return.
:return: List of Documents sorted by their similarity to the query with the most similar Documents appearing first.
"""
if not documents:
return {"documents": []}
@@ -113,7 +113,7 @@ def run(self, query: str, documents: List[Document], top_k: Optional[int] = None
# If a model path is provided but the model isn't loaded
if self.model_name_or_path and not self.model:
raise ComponentError(
f"The component {self.__class__.__name__} not warmed up. Run 'warm_up()' before calling 'run()'."
f"The component {self.__class__.__name__} wasn't warmed up. Run 'warm_up()' before calling 'run()'."
)

query_doc_pairs = [[query, doc.content] for doc in documents]
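The in-file usage example for TransformersSimilarityRanker is also collapsed in this diff, so here is a minimal sketch of the call sequence the docstrings above describe: construct the component, call warm_up(), then run() with a query and Documents. The import paths and the cross-encoder model name are assumptions, not values confirmed by the diff.

```python
# Hypothetical usage sketch for TransformersSimilarityRanker. Import paths
# and the model name are assumptions based on this commit's file layout,
# not values taken from the diff.
from haystack.preview import Document
from haystack.preview.components.rankers.transformers_similarity import (
    TransformersSimilarityRanker,
)

ranker = TransformersSimilarityRanker(
    model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2",  # assumed model
    top_k=2,
)
ranker.warm_up()  # loads the model and tokenizer; required before run()

docs = [
    Document(content="Paris is the capital of France."),
    Document(content="Berlin is the capital of Germany."),
]
result = ranker.run(query="What is the capital of France?", documents=docs)
print([doc.content for doc in result["documents"]])  # most similar first
```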
5 changes: 3 additions & 2 deletions test/preview/components/rankers/test_metafield.py
@@ -105,7 +105,7 @@ def test_linear_score_raises_warning_if_doc_wrong_score(self, score):
Document(id=3, content="abc", meta={"rating": 2.1}, score=0.6),
]
with pytest.warns(
UserWarning, match=rf"The score {score} for document 1 is outside the \[0,1\] range; defaulting to 0"
UserWarning, match=rf"The score {score} for Document 1 is outside the \[0,1\] range; defaulting to 0"
):
ranker.run(documents=docs_before)

@@ -117,5 +117,6 @@ def test_linear_score_raises_raises_warning_if_doc_without_score(self):
Document(content="abc", meta={"rating": 0.7}),
Document(content="abc", meta={"rating": 2.1}),
]
with pytest.warns(UserWarning, match="The score was not provided; defaulting to 0"):

with pytest.warns(UserWarning, match="The score wasn't provided; defaulting to 0."):
ranker.run(documents=docs_before)
