rework coherence to be summarization specific
b.nativi committed Aug 22, 2024
1 parent 7770c33 commit dfc7552
Showing 12 changed files with 470 additions and 778 deletions.
6 changes: 3 additions & 3 deletions api/tests/functional-tests/backend/core/test_llm_clients.py
@@ -987,23 +987,23 @@ def _return_invalid4_toxicity_response(*args, **kwargs):
"valor_api.backend.core.llm_clients.LLMClient.__call__",
_return_valid_coherence_response,
)
-assert 5 == client.coherence("some text")
+assert 5 == client.coherence("some text", "some summary")

# Coherence score is not an integer.
monkeypatch.setattr(
"valor_api.backend.core.llm_clients.LLMClient.__call__",
_return_invalid1_coherence_response,
)
with pytest.raises(InvalidLLMResponseError):
-client.coherence("some text")
+client.coherence("some text", "some summary")

# Coherence score is 0, which is not in {1,2,3,4,5}.
monkeypatch.setattr(
"valor_api.backend.core.llm_clients.LLMClient.__call__",
_return_invalid2_coherence_response,
)
with pytest.raises(InvalidLLMResponseError):
-client.coherence("some text")
+client.coherence("some text", "some summary")

# Patch __call__ with a valid response.
monkeypatch.setattr(
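For orientation, here is a minimal, self-contained sketch (not the actual valor_api implementation) of the validation pattern these tests exercise: the client asks the judge LLM for a 1-5 coherence rating of a summary and rejects any reply that is not an integer in that range. The class and prompt wording below are illustrative stand-ins for the monkeypatched LLMClient.__call__ above.

class InvalidLLMResponseError(Exception):
    """Raised when the judge LLM's reply cannot be parsed into a valid score."""


class CannedCoherenceClient:
    """Stand-in for the monkeypatched LLMClient.__call__ used in the tests above."""

    def __init__(self, canned_response: str):
        self.canned_response = canned_response

    def __call__(self, messages: list[dict[str, str]]) -> str:
        # Ignore the messages and return the canned reply, like the patched __call__.
        return self.canned_response

    def coherence(self, text: str, summary: str) -> int:
        reply = self(
            [{"role": "user", "content": f"Rate this summary of the text.\nText: {text}\nSummary: {summary}"}]
        )
        try:
            score = int(reply)
        except ValueError:
            raise InvalidLLMResponseError(f"coherence score is not an integer: {reply!r}")
        if score not in {1, 2, 3, 4, 5}:
            raise InvalidLLMResponseError(f"coherence score not in {{1, 2, 3, 4, 5}}: {score}")
        return score


assert CannedCoherenceClient("5").coherence("some text", "some summary") == 5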
251 changes: 200 additions & 51 deletions api/tests/functional-tests/backend/metrics/test_text_generation.py

Large diffs are not rendered by default.

11 changes: 1 addition & 10 deletions api/tests/unit-tests/schemas/test_evaluation.py
@@ -63,7 +63,6 @@ def test_EvaluationParameters(llm_api_params):
MetricType.AnswerRelevance,
MetricType.Bias,
MetricType.BLEU,
-MetricType.Coherence,
MetricType.ContextPrecision,
MetricType.ContextRecall,
MetricType.ContextRelevance,
@@ -83,7 +82,6 @@ def test_EvaluationParameters(llm_api_params):
MetricType.AnswerRelevance,
MetricType.Bias,
MetricType.BLEU,
-MetricType.Coherence,
MetricType.ContextPrecision,
MetricType.ContextRecall,
MetricType.ContextRelevance,
@@ -199,21 +197,14 @@ def test_EvaluationParameters(llm_api_params):
bleu_weights=[1.1, 0.3, -0.5, 0.1],
)

-# BLEU weights must sum to 1.
+# BLEU weights must sum to 1. metrics_to_return here are all metrics applicable to summarization.
with pytest.raises(ValidationError):
schemas.EvaluationParameters(
task_type=enums.TaskType.TEXT_GENERATION,
metrics_to_return=[
-MetricType.AnswerCorrectness,
-MetricType.AnswerRelevance,
MetricType.Bias,
MetricType.BLEU,
MetricType.Coherence,
-MetricType.ContextPrecision,
-MetricType.ContextRecall,
-MetricType.ContextRelevance,
-MetricType.Faithfulness,
-MetricType.Hallucination,
MetricType.ROUGE,
MetricType.Toxicity,
],
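As a side note on the failing case above: a minimal sketch of the sum-to-one check the test expects to trip (the real validation lives in valor_api's pydantic schemas; this is just the arithmetic, with assumed wording for the error message):

import math

def validate_bleu_weights(weights: list[float]) -> None:
    # BLEU weights are per n-gram order; they must be non-negative and sum to 1.
    if any(w < 0 for w in weights) or not math.isclose(sum(weights), 1.0):
        raise ValueError(f"BLEU weights must be non-negative and sum to 1, got {weights}")

validate_bleu_weights([0.25, 0.25, 0.25, 0.25])  # uniform 4-gram weights: valid
try:
    validate_bleu_weights([1.1, 0.3, -0.5, 0.1])  # the invalid weights used in the test above
except ValueError as err:
    print(err)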
8 changes: 4 additions & 4 deletions api/tests/unit-tests/schemas/test_metrics.py
@@ -631,7 +631,7 @@ def test_CoherenceMetric():
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
-"prediction": "some prediction",
+"prediction": "some summary",
},
)

@@ -641,7 +641,7 @@
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
-"prediction": "some prediction",
+"prediction": "some summary",
},
)

@@ -651,7 +651,7 @@
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
-"prediction": "some prediction",
+"prediction": "some summary",
},
)

@@ -661,7 +661,7 @@
parameters={
"dataset_uid": "01",
"dataset_name": "test_dataset",
-"prediction": "some prediction",
+"prediction": "some summary",
},
)

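For orientation only, a hypothetical construction of the metric under test. The class name and the value field are assumptions (the authoritative field names live in valor_api's schemas); only the parameters shown are taken from this diff.

metric = schemas.CoherenceMetric(  # assumed class name; `value` is an assumed field
    value=4,  # integer coherence score in {1, 2, 3, 4, 5}
    parameters={
        "dataset_uid": "01",
        "dataset_name": "test_dataset",
        "prediction": "some summary",  # the prediction being rated is now a summary
    },
)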
22 changes: 15 additions & 7 deletions api/valor_api/backend/core/llm_clients.py
@@ -403,14 +403,17 @@ def _generate_bias_verdicts(
def _coherence(
self,
text: str,
+summary: str,
) -> int:
"""
-Compute coherence, the collective quality of all sentences, for a single piece of text.
+Compute coherence, the collective quality of a summary.
Parameters
----------
text: str
-The text to be evaluated.
+The text that was summarized.
+summary: str
+The summary to be evaluated.
Returns
-------
@@ -421,7 +424,9 @@
{"role": "system", "content": DEFAULT_SYSTEM_PROMPT},
{
"role": "user",
-"content": generate_coherence_instruction(text=text),
+"content": generate_coherence_instruction(
+text=text, summary=summary
+),
},
]

@@ -854,21 +859,24 @@ def bias(
def coherence(
self,
text: str,
+summary: str,
) -> int:
"""
-Compute coherence, the collective quality of all sentences, for a single piece of text.
+Compute coherence, the collective quality of a summary.
Parameters
----------
text: str
-The text to be evaluated.
+The text that was summarized.
+summary: str
+The summary to be evaluated.
Returns
-------
int
The coherence score will be evaluated as an integer, with 1 indicating the lowest coherence and 5 the highest coherence.
"""
-return self._coherence(text)
+return self._coherence(text=text, summary=summary)

def context_precision(
self,
@@ -1480,7 +1488,7 @@ def __call__(

# Coherence score
elif (
-"Coherence (1-5) - the collective quality of all sentences."
+"Your task is to rate the summary based on its coherence"
in processed_messages[1]["content"]
):
response = "4"
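A hedged usage sketch of the reworked public method (client stands for a configured LLMClient subclass, so this is not runnable on its own): the metric now judges a summary against the text it summarizes rather than a lone passage.

text = (
    "The city council met on Tuesday and voted to expand the bike-lane network, "
    "citing a sharp rise in cycling over the past two years."
)
summary = "The council voted to expand bike lanes after cycling rose sharply."

score = client.coherence(text=text, summary=summary)  # `client`: an assumed LLMClient subclass instance
assert score in {1, 2, 3, 4, 5}  # returned as an integer, 1 = lowest coherence, 5 = highest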
26 changes: 16 additions & 10 deletions api/valor_api/backend/core/llm_instructions_analysis.py
@@ -343,37 +343,43 @@ def generate_bias_verdicts_instruction(opinions: list[str]) -> str:
"""


-def generate_coherence_instruction(text: str) -> str:
+def generate_coherence_instruction(
+text: str,
+summary: str,
+) -> str:
"""
Generate LLM instruction for evaluating the coherence of the text.
This instruction was adapted from appendix A of DeepEval's paper G-EVAL: NLG Evaluation using GPT-4 with Better Human Alignment (https://arxiv.org/pdf/2303.16634).
-The main adaptation is a generalization of the metric to more task types. The example prompt in DeepEval was specific to summarization, but the below prompt could apply to any text generation task.
-Crucially, unlike DeepEval, no context is used. Instead, the coherence of the text is evaluated entirely based on the text. This generalizes the prompt and also prevents the evaluation from being influenced by the quality of sentences in the context.
+The instruction was generalized to apply to any text summarization task, as opposed to DeepEval's example instruction which was specific to news article summarization.
Parameters
----------
text: str
-The text to be evaluated.
+The text that was summarized.
+summary: str
+The summary to be evaluated.
Returns
-------
str
-The instruction for the LLM.
+The instruction for the llm.
"""
return f"""Grade the text. Your task is to rate the text based on its coherence. Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.
return f"""You will be given one summary written for a piece of text. Your task is to rate the summary based on its coherence. Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing and refer to it as needed.
Evaluation Criteria:
-Coherence (1-5) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence whereby ”the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to sentence to a coherent body of information about a topic.
+Coherence (1-5) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence: the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to sentence to a coherent body of information about a topic.
Evaluation Steps:
1. Read the text carefully and identify the main topic and key points.
-2. Check if the text presents the information in a clear and logical order. Examine the collective quality of all sentences.
+2. Read the summary and compare it to the text. Check if the summary covers the main topic and key points of the text, and if it presents them in a clear and logical order.
3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria. Respond with just the number 1 to 5.
Text:
{text}
+Summary:
+{summary}
Coherence Score (1-5):
"""

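A minimal sketch of how the reworked instruction builder feeds the judge LLM, assuming valor_api is importable; the system-prompt string is a stand-in for DEFAULT_SYSTEM_PROMPT, which lives in llm_clients.py, and the sample text and summary are invented for illustration.

from valor_api.backend.core.llm_instructions_analysis import (
    generate_coherence_instruction,
)

text = (
    "Valor is an evaluation service. It stores datasets, model predictions, "
    "and the metrics computed over them."
)
summary = "Valor stores datasets, predictions, and computed metrics."

messages = [
    {"role": "system", "content": "You are a helpful assistant."},  # stand-in for DEFAULT_SYSTEM_PROMPT
    {
        "role": "user",
        "content": generate_coherence_instruction(text=text, summary=summary),
    },
]
# The user message now embeds both the source text and the summary, and asks the
# judge to reply with a single integer from 1 to 5.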