Skip to content

Commit

Permalink
refactor: FaithfulnessEvaluator specifies inputs explicitly (#7548)
Browse files Browse the repository at this point in the history
* Specify the inputs of `run()` explicitly; move the default examples out of `__init__` into a module-level constant

* Update haystack/components/evaluators/faithfulness.py

Co-authored-by: Madeesh Kannan <[email protected]>

---------

Co-authored-by: Madeesh Kannan <[email protected]>
  • Loading branch information
julian-risch and shadeMe authored Apr 22, 2024
1 parent b12e0db commit d7638cf
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 39 deletions.
82 changes: 45 additions & 37 deletions haystack/components/evaluators/faithfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,40 @@
from haystack.core.component import component
from haystack.utils import Secret, deserialize_secrets_inplace

# Built-in few-shot examples for the prompt; used when the caller does not supply any.
# Every entry pairs one "inputs" record (a question, its contexts and a response) with the
# expected "outputs" (the statements extracted from the response and one score per statement).
# NOTE(review): scores look like 1 = statement backed by the contexts, 0 = not backed -- confirm.
_DEFAULT_EXAMPLES = [
    # Both extracted statements receive a score of 1.
    {
        "inputs": {
            "questions": "What is the capital of Germany and when was it founded?",
            "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
            "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
        },
        "outputs": {
            "statements": [
                "Berlin is the capital of Germany.",
                "Berlin was founded in 1244.",
            ],
            "statement_scores": [1, 1],
        },
    },
    # The only context talks about Germany, so the single statement about France scores 0.
    {
        "inputs": {
            "questions": "What is the capital of France?",
            "contexts": ["Berlin is the capital of Germany."],
            "responses": "Paris",
        },
        "outputs": {
            "statements": ["Paris is the capital of France."],
            "statement_scores": [0],
        },
    },
    # Mixed case: the first statement scores 1, the second (population claim) scores 0.
    {
        "inputs": {
            "questions": "What is the capital of Italy?",
            "contexts": ["Rome is the capital of Italy."],
            "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
        },
        "outputs": {
            "statements": [
                "Rome is the capital of Italy.",
                "Rome has more than 4 million inhabitants.",
            ],
            "statement_scores": [1, 0],
        },
    },
]


class FaithfulnessEvaluator(LLMEvaluator):
"""
Expand Down Expand Up @@ -50,7 +84,8 @@ def __init__(
Creates an instance of FaithfulnessEvaluator.
:param examples:
Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
Optional few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
Default examples will be used if none are provided.
Each example must be a dictionary with keys "inputs" and "outputs".
"inputs" must be a dictionary with keys "questions", "contexts", and "responses".
"outputs" must be a dictionary with "statements" and "statement_scores".
Expand Down Expand Up @@ -81,38 +116,7 @@ def __init__(
)
self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
self.outputs = ["statements", "statement_scores"]
self.examples = examples or [
{
"inputs": {
"questions": "What is the capital of Germany and when was it founded?",
"contexts": ["Berlin is the capital of Germany and was founded in 1244."],
"responses": "The capital of Germany, Berlin, was founded in the 13th century.",
},
"outputs": {
"statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
"statement_scores": [1, 1],
},
},
{
"inputs": {
"questions": "What is the capital of France?",
"contexts": ["Berlin is the capital of Germany."],
"responses": "Paris",
},
"outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
},
{
"inputs": {
"questions": "What is the capital of Italy?",
"contexts": ["Rome is the capital of Italy."],
"responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
},
"outputs": {
"statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
"statement_scores": [1, 0],
},
},
]
self.examples = examples or _DEFAULT_EXAMPLES
self.api = api
self.api_key = api_key

Expand All @@ -126,19 +130,23 @@ def __init__(
)

@component.output_types(results=List[Dict[str, Any]])
def run(self, **inputs) -> Dict[str, Any]:
def run(self, questions: List[str], contexts: List[List[str]], responses: List[str]) -> Dict[str, Any]:
"""
Run the LLM evaluator.
:param inputs:
The input values to evaluate. The keys are the input names and the values are lists of input values.
:param questions:
A list of questions.
:param contexts:
A nested list of contexts that correspond to the questions.
:param responses:
A list of responses.
:returns:
A dictionary with the following outputs:
- `score`: Mean faithfulness score over all the provided input answers.
- `individual_scores`: A list of faithfulness scores for each input answer.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
"""
result = super().run(**inputs)
result = super().run(questions=questions, contexts=contexts, responses=responses)

# calculate average statement faithfulness score per query
for res in result["results"]:
Expand Down
35 changes: 33 additions & 2 deletions test/components/evaluators/test_faithfulness_evaluator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from typing import List

import pytest
Expand Down Expand Up @@ -108,10 +109,15 @@ def generator_run(self, *args, **kwargs):
questions = ["Which is the most popular global sport?", "Who created the Python language?"]
contexts = [
[
"The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people."
"The popularity of sports can be measured in various ways, including TV viewership, social media "
"presence, number of participants, and economic impact. Football is undoubtedly the world's most "
"popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
"Messi, drawing a followership of more than 4 billion people."
],
[
"Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
"Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming "
"language. Its design philosophy emphasizes code readability, and its language constructs aim to help "
"programmers write clear, logical code for both small and large-scale software projects."
],
]
responses = [
Expand All @@ -127,3 +133,28 @@ def generator_run(self, *args, **kwargs):
],
"score": 0.75,
}

def test_run_missing_parameters(self, monkeypatch):
    """Check that calling ``run()`` with no arguments raises ``TypeError``."""
    # A placeholder key is exported before the evaluator is built.
    # NOTE(review): presumably the constructor resolves OPENAI_API_KEY from the environment -- confirm.
    monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
    evaluator = FaithfulnessEvaluator()

    expected_message = "missing 3 required positional arguments"
    with pytest.raises(TypeError, match=expected_message):
        evaluator.run()

@pytest.mark.skipif(
    not os.environ.get("OPENAI_API_KEY"),
    reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
)
@pytest.mark.integration
def test_live_run(self):
    """
    Integration test; skipped unless the OPENAI_API_KEY environment variable is set.

    The response names a different creator than the context does, and the test
    expects two extracted statements scored ``[1, 0]``, i.e. an overall score of 0.5.
    """
    evaluator = FaithfulnessEvaluator()
    result = evaluator.run(
        questions=["What is Python and who created it?"],
        contexts=[["Python is a programming language created by Guido van Rossum."]],
        responses=["Python is a programming language created by George Lucas."],
    )

    # Aggregate scores.
    assert result["score"] == 0.5
    assert result["individual_scores"] == [0.5]

    # Per-answer details for the single input.
    first_result = result["results"][0]
    assert first_result["score"] == 0.5
    assert first_result["statement_scores"] == [1, 0]

    extracted = first_result["statements"]
    assert "programming language" in extracted[0]
    assert "George Lucas" in extracted[1]

0 comments on commit d7638cf

Please sign in to comment.