show result of usage example
julian-risch committed Mar 5, 2024
1 parent 710af1e commit 6cb9423
Showing 3 changed files with 67 additions and 39 deletions.
8 changes: 4 additions & 4 deletions haystack/components/rankers/meta_field.py
@@ -67,10 +67,10 @@ def __init__(
For example, if we specified `meta_value_type="date"` then for the meta value `"date": "2015-02-01"`
we would parse the string into a datetime object and then sort the documents by date.
The available options are:
-'float' will parse the meta values into floats.
-'int' will parse the meta values into integers.
-'date' will parse the meta values into datetime objects.
-'None' (default) will do no parsing.
- 'float' will parse the meta values into floats.
- 'int' will parse the meta values into integers.
- 'date' will parse the meta values into datetime objects.
- 'None' (default) will do no parsing.
"""

self.meta_field = meta_field
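The reformatted list above documents the `meta_value_type` options of `MetaFieldRanker`. As a short illustrative sketch (not part of this commit; the document contents and metadata are made up), using `meta_value_type="date"` to sort by a date stored as a string might look like this:

```python
from haystack import Document
from haystack.components.rankers import MetaFieldRanker

docs = [
    Document(content="Older article", meta={"date": "2015-02-01"}),
    Document(content="Newer article", meta={"date": "2023-11-15"}),
]
# Parse the string meta values into datetime objects before sorting, as the docstring describes.
ranker = MetaFieldRanker(meta_field="date", meta_value_type="date")
result = ranker.run(documents=docs)
print(result["documents"][0].meta["date"])  # expected: "2023-11-15" (descending order by default)
```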
1 change: 1 addition & 0 deletions haystack/components/rankers/transformers_similarity.py
@@ -33,6 +33,7 @@ class TransformersSimilarityRanker:
ranker.warm_up()
result = ranker.run(query=query, documents=docs)
docs = result["documents"]
print(docs[0].content)
```
"""

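The hunk above shows only the tail of the docstring example, including the newly added `print` line. For reference, a self-contained version of that example might look like the following sketch (query and documents are invented; the default cross-encoder model is downloaded on `warm_up()`):

```python
from haystack import Document
from haystack.components.rankers import TransformersSimilarityRanker

docs = [Document(content="Paris is in France"), Document(content="Berlin is in Germany")]
ranker = TransformersSimilarityRanker()
ranker.warm_up()  # loads the default cross-encoder model
result = ranker.run(query="Which city is in Germany?", documents=docs)
docs = result["documents"]
print(docs[0].content)  # the line added by this commit: show the top-ranked document
```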
97 changes: 62 additions & 35 deletions haystack/components/readers/extractive.py
@@ -21,19 +21,28 @@
@component
class ExtractiveReader:
"""
A component that locates and extract answers to a given query from Documents. It's used for performing extractive
QA. The Reader assigns a score to every possible answer span independently of other answer spans.
Locates and extracts answers to a given query from Documents.
The ExtractiveReader component performs extractive question answering.
It assigns a score to every possible answer span independently of other answer spans.
This fixes a common issue of other implementations which make comparisons across documents harder by normalizing
each document's answers independently.
Example usage:
```python
p = Pipeline()
p.add_component(instance=InMemoryBM25Retriever(document_store=InMemoryDocumentStore()), name="retriever")
p.add_component(instance=ExtractiveReader(), name="reader")
p.connect("retriever", "reader")
from haystack import Document
from haystack.components.readers import ExtractiveReader
docs = [
Document(content="Python is a popular programming language"),
Document(content="python ist eine beliebte Programmiersprache"),
]
reader = ExtractiveReader()
reader.warm_up()
question = "Who lives in Berlin?"
p.run({"retriever": {"query": question}, "reader": {"query": question}})
reader.run(query=question, documents=docs)
```
"""

@@ -54,37 +63,43 @@ def __init__(
model_kwargs: Optional[Dict[str, Any]] = None,
) -> None:
"""
Creates an ExtractiveReader
:param model: A Hugging Face transformers question answering model.
Creates an instance of ExtractiveReader.
:param model:
A Hugging Face transformers question answering model.
Can either be a path to a folder containing the model files or an identifier for the Hugging Face hub.
Default: `'deepset/roberta-base-squad2-distilled'`
:param device: The device on which the model is loaded. If `None`, the default device is automatically
selected.
:param token: The API token used to download private models from Hugging Face.
If this parameter is set to `True`, then the token generated when running
`transformers-cli login` (stored in ~/.huggingface) is used.
:param top_k: Number of answers to return per query.
It is required even if score_threshold is set. Defaults to 20.
:param device:
The device on which the model is loaded. If `None`, the default device is automatically selected.
:param token:
The API token used to download private models from Hugging Face.
:param top_k:
Number of answers to return per query. It is required even if score_threshold is set.
An additional answer with no text is returned if no_answer is set to True (default).
:param score_threshold: Returns only answers with the probability score above this threshold.
:param max_seq_length: Maximum number of tokens.
If a sequence exceeds it, the sequence is split.
Default: 384
:param stride: Number of tokens that overlap when sequence is split because it exceeds max_seq_length.
Default: 128
:param max_batch_size: Maximum number of samples that are fed through the model at the same time.
:param answers_per_seq: Number of answer candidates to consider per sequence.
:param score_threshold:
Returns only answers with the probability score above this threshold.
:param max_seq_length:
Maximum number of tokens. If a sequence exceeds it, the sequence is split.
:param stride:
Number of tokens that overlap when sequence is split because it exceeds max_seq_length.
:param max_batch_size:
Maximum number of samples that are fed through the model at the same time.
:param answers_per_seq:
Number of answer candidates to consider per sequence.
This is relevant when a Document was split into multiple sequences because of max_seq_length.
:param no_answer: Whether to return an additional `no answer` with an empty text and a score representing the
:param no_answer:
Whether to return an additional `no answer` with an empty text and a score representing the
probability that the other top_k answers are incorrect.
:param calibration_factor: Factor used for calibrating probabilities.
:param overlap_threshold: If set this will remove duplicate answers if they have an overlap larger than the
:param calibration_factor:
Factor used for calibrating probabilities.
:param overlap_threshold:
If set this will remove duplicate answers if they have an overlap larger than the
supplied threshold. For example, for the answers "in the river in Maine" and "the river" we would remove
one of these answers since the second answer has a 100% (1.0) overlap with the first answer.
However, for the answers "the river in" and "in Maine" there is only a max overlap percentage of 25% so
both of these answers could be kept if this variable is set to 0.24 or lower.
If None is provided then all answers are kept.
:param model_kwargs: Additional keyword arguments passed to `AutoModelForQuestionAnswering.from_pretrained`
:param model_kwargs:
Additional keyword arguments passed to `AutoModelForQuestionAnswering.from_pretrained`
when loading the model specified in `model`. For details on what kwargs you can pass,
see the model's documentation.
"""
@@ -115,7 +130,10 @@ def _get_telemetry_data(self) -> Dict[str, Any]:

def to_dict(self) -> Dict[str, Any]:
"""
Serialize this component to a dictionary.
Serializes the component to a dictionary.
:returns:
Dictionary with serialized data.
"""
serialization_dict = default_to_dict(
self,
@@ -139,7 +157,12 @@ def to_dict(self) -> Dict[str, Any]:
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ExtractiveReader":
"""
Deserialize this component from a dictionary.
Deserializes the component from a dictionary.
:param data:
Dictionary to deserialize from.
:returns:
Deserialized component.
"""
init_params = data["init_parameters"]
if init_params["device"] is not None:
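The updated docstrings above describe the serialization pair. A minimal round-trip sketch, assuming init parameters are kept as attributes of the component, could look like this (not part of the commit; the exact dictionary contents depend on the Haystack version):

```python
from haystack.components.readers import ExtractiveReader

reader = ExtractiveReader(top_k=10)
data = reader.to_dict()  # e.g. {"type": "...ExtractiveReader", "init_parameters": {...}}
restored = ExtractiveReader.from_dict(data)
assert restored.top_k == 10  # assumes top_k is stored as an attribute, as in __init__
```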
@@ -150,7 +173,7 @@ def from_dict(cls, data: Dict[str, Any]) -> "ExtractiveReader":

def warm_up(self):
"""
Loads model and tokenizer
Initializes the component.
"""
# Take the first device used by `accelerate`. Needed to pass inputs from the tokenizer to the correct device.
if self.model is None:
@@ -422,16 +445,20 @@ def deduplicate_by_overlap(
self, answers: List[ExtractedAnswer], overlap_threshold: Optional[float]
) -> List[ExtractedAnswer]:
"""
This de-duplicates overlapping Extractive Answers from the same document based on how much the spans of the
De-duplicates overlapping Extractive Answers from the same document based on how much the spans of the
answers overlap.
:param answers: List of answers to be deduplicated.
:param overlap_threshold: If set this will remove duplicate answers if they have an overlap larger than the
:param answers:
List of answers to be deduplicated.
:param overlap_threshold:
If set this will remove duplicate answers if they have an overlap larger than the
supplied threshold. For example, for the answers "in the river in Maine" and "the river" we would remove
one of these answers since the second answer has a 100% (1.0) overlap with the first answer.
However, for the answers "the river in" and "in Maine" there is only a max overlap percentage of 25% so
both of these answers could be kept if this variable is set to 0.24 or lower.
If None is provided then all answers are kept.
:returns:
List of deduplicated answers.
"""
if overlap_threshold is None:
return answers
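To make the percentages in the `overlap_threshold` description concrete, here is a small hypothetical sketch of computing the overlap of two character spans relative to the shorter span. It is illustrative only and not the library's actual implementation:

```python
def span_overlap_ratio(span_a: tuple, span_b: tuple) -> float:
    """Overlap of two (start, end) character spans, relative to the shorter span."""
    start = max(span_a[0], span_b[0])
    end = min(span_a[1], span_b[1])
    overlap = max(0, end - start)
    shorter = min(span_a[1] - span_a[0], span_b[1] - span_b[0])
    return overlap / shorter if shorter else 0.0

# In "in the river in Maine", "the river" lies entirely inside the longer answer -> ratio 1.0,
# so one of the two would be removed at any threshold <= 1.0.
print(span_overlap_ratio((0, 21), (3, 12)))   # 1.0
# "the river in" and "in Maine" share only "in" -> ratio 0.25, so both are kept if the threshold is <= 0.24.
print(span_overlap_ratio((3, 15), (13, 21)))  # 0.25
```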
