rework coherence to be summarization specific
b.nativi committed Aug 22, 2024
1 parent 7770c33 commit dfc7552
Showing 12 changed files with 470 additions and 778 deletions.
6 changes: 3 additions & 3 deletions api/tests/functional-tests/backend/core/test_llm_clients.py
@@ -987,23 +987,23 @@ def _return_invalid4_toxicity_response(*args, **kwargs):
"valor_api.backend.core.llm_clients.LLMClient.__call__",
_return_valid_coherence_response,
)
-assert 5 == client.coherence("some text")
+assert 5 == client.coherence("some text", "some summary")

# Coherence score is not an integer.
monkeypatch.setattr(
"valor_api.backend.core.llm_clients.LLMClient.__call__",
_return_invalid1_coherence_response,
)
with pytest.raises(InvalidLLMResponseError):
-client.coherence("some text")
+client.coherence("some text", "some summary")

# Coherence score is 0, which is not in {1,2,3,4,5}.
monkeypatch.setattr(
"valor_api.backend.core.llm_clients.LLMClient.__call__",
_return_invalid2_coherence_response,
)
with pytest.raises(InvalidLLMResponseError):
-client.coherence("some text")
+client.coherence("some text", "some summary")

# Patch __call__ with a valid response.
monkeypatch.setattr(
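For orientation, here is a minimal, self-contained sketch (not the actual valor_api implementation) of the validation pattern these tests exercise: the client asks the judge LLM for a 1-5 coherence rating of a summary and rejects any reply that is not an integer in that range. The class and prompt wording below are illustrative stand-ins for the monkeypatched LLMClient.__call__ above.

class InvalidLLMResponseError(Exception):
    """Raised when the judge LLM's reply cannot be parsed into a valid score."""


class CannedCoherenceClient:
    """Stand-in for the monkeypatched LLMClient.__call__ used in the tests above."""

    def __init__(self, canned_response: str):
        self.canned_response = canned_response

    def __call__(self, messages: list[dict[str, str]]) -> str:
        # Ignore the messages and return the canned reply, like the patched __call__.
        return self.canned_response

    def coherence(self, text: str, summary: str) -> int:
        reply = self(
            [{"role": "user", "content": f"Rate this summary of the text.\nText: {text}\nSummary: {summary}"}]
        )
        try:
            score = int(reply)
        except ValueError:
            raise InvalidLLMResponseError(f"coherence score is not an integer: {reply!r}")
        if score not in {1, 2, 3, 4, 5}:
            raise InvalidLLMResponseError(f"coherence score not in {{1, 2, 3, 4, 5}}: {score}")
        return score


assert CannedCoherenceClient("5").coherence("some text", "some summary") == 5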
251 changes: 200 additions & 51 deletions api/tests/functional-tests/backend/metrics/test_text_generation.py

Large diffs are not rendered by default.

11 changes: 1 addition & 10 deletions api/tests/unit-tests/schemas/test_evaluation.py
@@ -63,7 +63,6 @@ def test_EvaluationParameters(llm_api_params):
MetricType.AnswerRelevance,
MetricType.Bias,
MetricType.BLEU,
-MetricType.Coherence,
MetricType.ContextPrecision,
MetricType.ContextRecall,
MetricType.ContextRelevance,
@@ -83,7 +82,6 @@ def test_EvaluationParameters(llm_api_params):
MetricType.AnswerRelevance,
MetricType.Bias,
MetricType.BLEU,
-MetricType.Coherence,
MetricType.ContextPrecision,
MetricType.ContextRecall,
MetricType.ContextRelevance,
@@ -199,21 +197,14 @@ def test_EvaluationParameters(llm_api_params):
bleu_weights=[1.1, 0.3, -0.5, 0.1],
)

-# BLEU weights must sum to 1.
+# BLEU weights must sum to 1. metrics_to_return here are all metrics applicable to summarization.
with pytest.raises(ValidationError):
schemas.EvaluationParameters(
task_type=enums.TaskType.TEXT_GENERATION,
metrics_to_return=[
-MetricType.AnswerCorrectness,
-MetricType.AnswerRelevance,
MetricType.Bias,
MetricType.BLEU,
MetricType.Coherence,
-MetricType.ContextPrecision,
-MetricType.ContextRecall,
-MetricType.ContextRelevance,
-MetricType.Faithfulness,
-MetricType.Hallucination,
MetricType.ROUGE,
MetricType.Toxicity,
],
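As a side note on the failing case above: a minimal sketch of the sum-to-one check the test expects to trip (the real validation lives in valor_api's pydantic schemas; this is just the arithmetic, with assumed wording for the error message):

import math

def validate_bleu_weights(weights: list[float]) -> None:
    # BLEU weights are per n-gram order; they must be non-negative and sum to 1.
    if any(w < 0 for w in weights) or not math.isclose(sum(weights), 1.0):
        raise ValueError(f"BLEU weights must be non-negative and sum to 1, got {weights}")

validate_bleu_weights([0.25, 0.25, 0.25, 0.25])  # uniform 4-gram weights: valid
try:
    validate_bleu_weights([1.1, 0.3, -0.5, 0.1])  # the invalid weights used in the test above
except ValueError as err:
    print(err)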
8 changes: 4 additions & 4 deletions api/tests/unit-tests/schemas/test_metrics.py
@@ -631,7 +631,7 @@ def test_CoherenceMetric():
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
-"prediction": "some prediction",
+"prediction": "some summary",
},
)

@@ -641,7 +641,7 @@
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
-"prediction": "some prediction",
+"prediction": "some summary",
},
)

@@ -651,7 +651,7 @@
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
-"prediction": "some prediction",
+"prediction": "some summary",
},
)

@@ -661,7 +661,7 @@
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
-"prediction": "some prediction",
+"prediction": "some summary",
},
)

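For orientation only, a hypothetical construction of the metric under test. The class name and the value field are assumptions (the authoritative field names live in valor_api's schemas); only the parameters shown are taken from this diff.

metric = schemas.CoherenceMetric(  # assumed class name; `value` is an assumed field
    value=4,  # integer coherence score in {1, 2, 3, 4, 5}
    parameters={
        "dataset_uid": "01",
        "dataset_name": "test_dataset",
        "prediction": "some summary",  # the prediction being rated is now a summary
    },
)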
22 changes: 15 additions & 7 deletions api/valor_api/backend/core/llm_clients.py
@@ -403,14 +403,17 @@ def _generate_bias_verdicts(
def _coherence(
self,
text: str,
+summary: str,
) -> int:
"""
-Compute coherence, the collective quality of all sentences, for a single piece of text.
+Compute coherence, the collective quality of a summary.
Parameters
----------
text: str
-The text to be evaluated.
+The text that was summarized.
+summary: str
+The summary to be evaluated.
Returns
-------
@@ -421,7 +424,9 @@
{"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
{
"role": "user",
-"content": generate_coherence_instruction(text=text),
+"content": generate_coherence_instruction(
+text=text, summary=summary
+),
},
]

@@ -854,21 +859,24 @@ def bias(
def coherence(
self,
text: str,
+summary: str,
) -> int:
"""
-Compute coherence, the collective quality of all sentences, for a single piece of text.
+Compute coherence, the collective quality of a summary.
Parameters
----------
text: str
-The text to be evaluated.
+The text that was summarized.
+summary: str
+The summary to be evaluated.
Returns
-------
int
The coherence score will be evaluated as an integer, with 1 indicating the lowest coherence and 5 the highest coherence.
"""
-return self._coherence(text)
+return self._coherence(text=text, summary=summary)

def context_precision(
self,
@@ -1480,7 +1488,7 @@ def __call__(

# Coherence score
elif (
-"Coherence (1-5) - the collective quality of all sentences."
+"Your task is to rate the summary based on its coherence"
in processed_messages[1]["content"]
):
response = "4"
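A hedged usage sketch of the reworked public method (client stands for a configured LLMClient subclass, so this is not runnable on its own): the metric now judges a summary against the text it summarizes rather than a lone passage.

text = (
    "The city council met on Tuesday and voted to expand the bike-lane network, "
    "citing a sharp rise in cycling over the past two years."
)
summary = "The council voted to expand bike lanes after cycling rose sharply."

score = client.coherence(text=text, summary=summary)  # `client`: an assumed LLMClient subclass instance
assert score in {1, 2, 3, 4, 5}  # returned as an integer, 1 = lowest coherence, 5 = highest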
26 changes: 16 additions & 10 deletions api/valor_api/backend/core/llm_instructions_analysis.py
@@ -343,37 +343,43 @@ def generate_bias_verdicts_instruction(opinions: list[str]) -> str:
"""


-def generate_coherence_instruction(text: str) -> str:
+def generate_coherence_instruction(
+text: str,
+summary: str,
+) -> str:
"""
Generate LLM instruction for evaluating the coherence of the text.
This instruction was adapted from appendix A of DeepEval's paper G-EVAL: NLG Evaluation using GPT-4 with Better Human Alignment (https://arxiv.org/pdf/2303.16634).
-The main adaptation is a generalization of the metric to more task types. The example prompt in DeepEval was specific to summarization, but the below prompt could apply to any text generation task.
-Crucially, unlike DeepEval, no context is used. Instead, the coherence of the text is evaluated entirely based on the text. This generalizes the prompt and also prevents the evaluation from being influenced by the quality of sentences in the context.
+The instruction was generalized to apply to any text summarization task, as opposed to DeepEval's example instruction which was specific to news article summarization.
Parameters
----------
text: str
-The text to be evaluated.
+The text that was summarized.
+summary: str
+The summary to be evaluated.
Returns
-------
str
-The instruction for the LLM.
+The instruction for the llm.
"""
return f"""Grade the text. Your task is to rate the text based on its coherence. Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
return f"""You will be given one summary written for a piece of text. Your task is to rate the summary based on its coherence. Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing and refer to it as needed.
Evaluation Criteria:
-Coherence (1-5) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence whereby ”the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to sentence to a coherent body of information about a topic.
+Coherence (1-5) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence: the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to sentence to a coherent body of information about a topic.
Evaluation Steps:
1. Read the text carefully and identify the main topic and key points.
-2. Check if the text presents the information in a clear and logical order. Examine the collective quality of all sentences.
+2. Read the summary and compare it to the text. Check if the summary covers the main topic and key points of the text, and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria. Respond with just the number 1 to 5.
Text:
{text}
+Summary:
+{summary}
Coherence Score (1-5):
"""

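A minimal sketch of how the reworked instruction builder feeds the judge LLM, assuming valor_api is importable; the system-prompt string is a stand-in for DEFAULT_SYSTEM_PROMPT, which lives in llm_clients.py, and the sample text and summary are invented for illustration.

from valor_api.backend.core.llm_instructions_analysis import (
    generate_coherence_instruction,
)

text = (
    "Valor is an evaluation service. It stores datasets, model predictions, "
    "and the metrics computed over them."
)
summary = "Valor stores datasets, predictions, and computed metrics."

messages = [
    {"role": "system", "content": "You are a helpful assistant."},  # stand-in for DEFAULT_SYSTEM_PROMPT
    {
        "role": "user",
        "content": generate_coherence_instruction(text=text, summary=summary),
    },
]
# The user message now embeds both the source text and the summary, and asks the
# judge to reply with a single integer from 1 to 5.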