From d7638cfd4b0c86f2985e666a5845959bacc68a85 Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Mon, 22 Apr 2024 14:52:10 +0200
Subject: [PATCH] refactor: FaithfulnessEvaluator specifies inputs explicitly
 (#7548)

* Specify the inputs of run() explicitly; move the default examples out into a module-level constant

* Update haystack/components/evaluators/faithfulness.py

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>

---------

Co-authored-by: Madeesh Kannan <shadeMe@users.noreply.github.com>
---
 .../components/evaluators/faithfulness.py     | 82 ++++++++++---------
 .../evaluators/test_faithfulness_evaluator.py | 35 +++++++-
 2 files changed, 78 insertions(+), 39 deletions(-)

diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py
index 7722995b44..8274990387 100644
--- a/haystack/components/evaluators/faithfulness.py
+++ b/haystack/components/evaluators/faithfulness.py
@@ -7,6 +7,40 @@
 from haystack.core.component import component
 from haystack.utils import Secret, deserialize_secrets_inplace
 
+# Default examples to include in the prompt if the user does not provide any examples
+_DEFAULT_EXAMPLES = [
+    {
+        "inputs": {
+            "questions": "What is the capital of Germany and when was it founded?",
+            "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
+            "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
+        },
+        "outputs": {
+            "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
+            "statement_scores": [1, 1],
+        },
+    },
+    {
+        "inputs": {
+            "questions": "What is the capital of France?",
+            "contexts": ["Berlin is the capital of Germany."],
+            "responses": "Paris",
+        },
+        "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
+    },
+    {
+        "inputs": {
+            "questions": "What is the capital of Italy?",
+            "contexts": ["Rome is the capital of Italy."],
+            "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
+        },
+        "outputs": {
+            "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
+            "statement_scores": [1, 0],
+        },
+    },
+]
+
 
 class FaithfulnessEvaluator(LLMEvaluator):
     """
@@ -50,7 +84,8 @@ def __init__(
         Creates an instance of FaithfulnessEvaluator.
 
         :param examples:
-            Few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
+            Optional few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
+            Default examples will be used if none are provided.
             Each example must be a dictionary with keys "inputs" and "outputs".
             "inputs" must be a dictionary with keys "questions", "contexts", and "responses".
             "outputs" must be a dictionary with "statements" and "statement_scores".
@@ -81,38 +116,7 @@ def __init__(
         )
         self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
         self.outputs = ["statements", "statement_scores"]
-        self.examples = examples or [
-            {
-                "inputs": {
-                    "questions": "What is the capital of Germany and when was it founded?",
-                    "contexts": ["Berlin is the capital of Germany and was founded in 1244."],
-                    "responses": "The capital of Germany, Berlin, was founded in the 13th century.",
-                },
-                "outputs": {
-                    "statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
-                    "statement_scores": [1, 1],
-                },
-            },
-            {
-                "inputs": {
-                    "questions": "What is the capital of France?",
-                    "contexts": ["Berlin is the capital of Germany."],
-                    "responses": "Paris",
-                },
-                "outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
-            },
-            {
-                "inputs": {
-                    "questions": "What is the capital of Italy?",
-                    "contexts": ["Rome is the capital of Italy."],
-                    "responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
-                },
-                "outputs": {
-                    "statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
-                    "statement_scores": [1, 0],
-                },
-            },
-        ]
+        self.examples = examples or _DEFAULT_EXAMPLES
         self.api = api
         self.api_key = api_key
 
@@ -126,19 +130,23 @@ def __init__(
         )
 
     @component.output_types(results=List[Dict[str, Any]])
-    def run(self, **inputs) -> Dict[str, Any]:
+    def run(self, questions: List[str], contexts: List[List[str]], responses: List[str]) -> Dict[str, Any]:
         """
         Run the LLM evaluator.
 
-        :param inputs:
-            The input values to evaluate. The keys are the input names and the values are lists of input values.
+        :param questions:
+            A list of questions.
+        :param contexts:
+            A nested list of contexts: one list of context strings for each question.
+        :param responses:
+            A list of responses (answers) to be evaluated for faithfulness.
         :returns:
             A dictionary with the following outputs:
                 - `score`: Mean faithfulness score over all the provided input answers.
                 - `individual_scores`: A list of faithfulness scores for each input answer.
                 - `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
         """
-        result = super().run(**inputs)
+        result = super().run(questions=questions, contexts=contexts, responses=responses)
 
         # calculate average statement faithfulness score per query
         for res in result["results"]:
diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py
index 5776437366..0aa97ee95b 100644
--- a/test/components/evaluators/test_faithfulness_evaluator.py
+++ b/test/components/evaluators/test_faithfulness_evaluator.py
@@ -1,3 +1,4 @@
+import os
 from typing import List
 
 import pytest
@@ -108,10 +109,15 @@ def generator_run(self, *args, **kwargs):
         questions = ["Which is the most popular global sport?", "Who created the Python language?"]
         contexts = [
             [
-                "The popularity of sports can be measured in various ways, including TV viewership, social media presence, number of participants, and economic impact. Football is undoubtedly the world's most popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and Messi, drawing a followership of more than 4 billion people."
+                "The popularity of sports can be measured in various ways, including TV viewership, social media "
+                "presence, number of participants, and economic impact. Football is undoubtedly the world's most "
+                "popular sport with major events like the FIFA World Cup and sports personalities like Ronaldo and "
+                "Messi, drawing a followership of more than 4 billion people."
             ],
             [
-                "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
+                "Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming "
+                "language. Its design philosophy emphasizes code readability, and its language constructs aim to help "
+                "programmers write clear, logical code for both small and large-scale software projects."
             ],
         ]
         responses = [
@@ -127,3 +133,28 @@ def generator_run(self, *args, **kwargs):
             ],
             "score": 0.75,
         }
+
+    def test_run_missing_parameters(self, monkeypatch):
+        monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
+        component = FaithfulnessEvaluator()
+        with pytest.raises(TypeError, match="missing 3 required positional arguments"):
+            component.run()
+
+    @pytest.mark.skipif(
+        not os.environ.get("OPENAI_API_KEY", None),
+        reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
+    )
+    @pytest.mark.integration
+    def test_live_run(self):
+        questions = ["What is Python and who created it?"]
+        contexts = [["Python is a programming language created by Guido van Rossum."]]
+        responses = ["Python is a programming language created by George Lucas."]
+        evaluator = FaithfulnessEvaluator()
+        result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
+
+        assert result["score"] == 0.5
+        assert result["individual_scores"] == [0.5]
+        assert result["results"][0]["score"] == 0.5
+        assert result["results"][0]["statement_scores"] == [1, 0]
+        assert "programming language" in result["results"][0]["statements"][0]
+        assert "George Lucas" in result["results"][0]["statements"][1]