feat: Add DocumentMeanAveragePrecision (#7461)
* Add DocumentMeanAveragePrecision

* Remove questions input

* Update docstrings

* Update haystack/components/evaluators/document_map.py

Co-authored-by: Madeesh Kannan <[email protected]>

---------

Co-authored-by: Madeesh Kannan <[email protected]>
silvanocerza and shadeMe authored Apr 4, 2024
1 parent dc87f51 commit 7799909
Showing 3 changed files with 166 additions and 0 deletions.
84 changes: 84 additions & 0 deletions haystack/components/evaluators/document_map.py
@@ -0,0 +1,84 @@
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentMeanAveragePrecision:
    """
    Evaluator that calculates the mean average precision of the retrieved documents, a metric
    that measures how highly the retrieved documents are ranked.
    Each question can have multiple ground truth documents and multiple retrieved documents.
    `DocumentMeanAveragePrecision` doesn't normalize its inputs; use the `DocumentCleaner` component
    to clean and normalize the documents before passing them to this evaluator.
    Usage example:
    ```python
    from haystack import Document
    from haystack.components.evaluators.document_map import DocumentMeanAveragePrecision

    evaluator = DocumentMeanAveragePrecision()
    result = evaluator.run(
        ground_truth_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="9th")],
        ],
        retrieved_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
        ],
    )
    print(result["individual_scores"])
    # [1.0, 0.8333333333333333]
    print(result["score"])
    # 0.9166666666666666
    ```
    """

    @component.output_types(score=float, individual_scores=List[float])
    def run(
        self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
    ) -> Dict[str, Any]:
        """
        Run the DocumentMeanAveragePrecision on the given inputs.
        All lists must have the same length.
        :param ground_truth_documents:
            A list of expected documents for each question.
        :param retrieved_documents:
            A list of retrieved documents for each question.
        :returns:
            A dictionary with the following outputs:
            - `score` - The average of the calculated scores.
            - `individual_scores` - A list of numbers from 0.0 to 1.0, one per question, representing how highly
              the retrieved documents are ranked.
        """
        if len(ground_truth_documents) != len(retrieved_documents):
            msg = "The length of ground_truth_documents and retrieved_documents must be the same."
            raise ValueError(msg)

        individual_scores = []

        for ground_truth, retrieved in zip(ground_truth_documents, retrieved_documents):
            score = 0.0
            for ground_document in ground_truth:
                if ground_document.content is None:
                    continue

                average_precision = 0.0
                relevant_documents = 0

                for rank, retrieved_document in enumerate(retrieved):
                    if retrieved_document.content is None:
                        continue

                    if ground_document.content in retrieved_document.content:
                        relevant_documents += 1
                        average_precision += relevant_documents / (rank + 1)
                if relevant_documents > 0:
                    score = average_precision / relevant_documents
            individual_scores.append(score)

        score = sum(individual_scores) / len(retrieved_documents)

        return {"score": score, "individual_scores": individual_scores}
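For context on the numbers in the docstring example above (not part of the commit): a retrieved document counts as relevant when its content contains the ground-truth content as a substring (`ground_document.content in retrieved_document.content`), precision is accumulated at each relevant rank, and when a question has several ground-truth documents the loop keeps the average precision of the last ground-truth document that had any match. A minimal sketch of the arithmetic behind the `0.8333…` individual score:

```python
# Second question of the docstring example: ground truth "9th" matches the
# retrieved list ["9th century", "10th century", "9th"] at ranks 1 and 3
# (1-indexed), so precision is taken at those two ranks and then averaged.
precisions_at_relevant_ranks = [1 / 1, 2 / 3]
average_precision = sum(precisions_at_relevant_ranks) / len(precisions_at_relevant_ranks)

print(average_precision)              # 0.8333333333333333 -> individual score
print((1.0 + average_precision) / 2)  # 0.9166666666666666 -> overall "score"
```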
@@ -0,0 +1,4 @@
---
features:
  - |
    Add DocumentMeanAveragePrecision, which can be used to calculate the mean average precision of retrieved documents.
78 changes: 78 additions & 0 deletions test/components/evaluators/test_document_map.py
@@ -0,0 +1,78 @@
import pytest

from haystack import Document
from haystack.components.evaluators.document_map import DocumentMeanAveragePrecision


def test_run_with_all_matching():
    evaluator = DocumentMeanAveragePrecision()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        retrieved_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
    )

    assert result == {"individual_scores": [1.0, 1.0], "score": 1.0}


def test_run_with_no_matching():
    evaluator = DocumentMeanAveragePrecision()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        retrieved_documents=[[Document(content="Paris")], [Document(content="London")]],
    )

    assert result == {"individual_scores": [0.0, 0.0], "score": 0.0}


def test_run_with_partial_matching():
    evaluator = DocumentMeanAveragePrecision()
    result = evaluator.run(
        ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
        retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
    )

    assert result == {"individual_scores": [1.0, 0.0], "score": 0.5}


def test_run_with_complex_data():
    evaluator = DocumentMeanAveragePrecision()
    result = evaluator.run(
        ground_truth_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="9th")],
            [Document(content="classical music"), Document(content="classical")],
            [Document(content="11th century"), Document(content="the 11th")],
            [Document(content="Denmark, Iceland and Norway")],
            [Document(content="10th century"), Document(content="10th")],
        ],
        retrieved_documents=[
            [Document(content="France")],
            [Document(content="9th century"), Document(content="10th century"), Document(content="9th")],
            [Document(content="classical"), Document(content="rock music"), Document(content="dubstep")],
            [Document(content="11th"), Document(content="the 11th"), Document(content="11th century")],
            [Document(content="Denmark"), Document(content="Norway"), Document(content="Iceland")],
            [
                Document(content="10th century"),
                Document(content="the first half of the 10th century"),
                Document(content="10th"),
                Document(content="10th"),
            ],
        ],
    )
    assert result == {"individual_scores": [1.0, 0.8333333333333333, 1.0, 0.5, 0.0, 1.0], "score": 0.7222222222222222}


def test_run_with_different_lengths():
    with pytest.raises(ValueError):
        evaluator = DocumentMeanAveragePrecision()
        evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")]],
            retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
        )

    with pytest.raises(ValueError):
        evaluator = DocumentMeanAveragePrecision()
        evaluator.run(
            ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
            retrieved_documents=[[Document(content="Berlin")]],
        )
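
Beyond the direct `run()` calls exercised in the tests, the evaluator can also be used as a regular pipeline component. The sketch below is not part of the commit and assumes the standard Haystack 2.x `Pipeline.add_component` / `Pipeline.run` API:

```python
from haystack import Document, Pipeline
from haystack.components.evaluators.document_map import DocumentMeanAveragePrecision

# Register the evaluator as a pipeline component and pass both document lists as its inputs.
pipeline = Pipeline()
pipeline.add_component("map_evaluator", DocumentMeanAveragePrecision())

results = pipeline.run(
    {
        "map_evaluator": {
            "ground_truth_documents": [[Document(content="Berlin")]],
            "retrieved_documents": [[Document(content="Berlin")]],
        }
    }
)
print(results["map_evaluator"]["score"])  # 1.0
```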
