diff --git a/api/tests/functional-tests/backend/core/test_llm_clients.py b/api/tests/functional-tests/backend/core/test_llm_clients.py index 927e22301..d617bc183 100644 --- a/api/tests/functional-tests/backend/core/test_llm_clients.py +++ b/api/tests/functional-tests/backend/core/test_llm_clients.py @@ -55,59 +55,90 @@ ] }```""" +GROUNDTRUTH_VALID_STATEMENTS = """```json +{ + "statements": [ + "gt statement 1", + "gt statement 2", + "gt statement 3", + "gt statement 4" + ] +}```""" + +ANSWER_CORRECTNESS_VALID_VERDICTS = """```json +{ + "TP": [ + "statement 1", + "statement 2", + "statement 4" + ], + "FP": [ + "statement 3" + ], + "FN": [ + "gt statement 1", + "gt statement 4" + ] +}```""" + ANSWER_RELEVANCE_VALID_VERDICTS = """```json { "verdicts": [ - { - "verdict": "no", - "reason": "The statement has nothing to do with the query." - }, - { - "verdict": "yes" - }, - { - "verdict": "idk" - }, - { - "verdict": "yes" - } + {"verdict": "no"}, + {"verdict": "yes"}, + {"verdict": "idk"}, + {"verdict": "yes"} ] }```""" BIAS_VALID_VERDICTS = """```json { "verdicts": [ - { - "verdict": "yes", - "reason": "This opinion demonstrates gender bias." - }, - { - "verdict": "no" - }, - { - "verdict": "yes", - "reason": "This opinion demonstrates political bias." - }, - { - "verdict": "no" - } + {"verdict": "yes"}, + {"verdict": "no"}, + {"verdict": "yes"}, + {"verdict": "no"} + ] +}```""" + +CONTEXT_PRECISION_VALID1_VERDICTS = """```json +{ + "verdicts": [ + {"verdict": "no"}, + {"verdict": "yes"}, + {"verdict": "no"}, + {"verdict": "no"}, + {"verdict": "yes"} + ] +}```""" + +CONTEXT_PRECISION_VALID2_VERDICTS = """```json +{ + "verdicts": [ + {"verdict": "no"}, + {"verdict": "no"}, + {"verdict": "no"}, + {"verdict": "no"}, + {"verdict": "no"} + ] +}```""" + +CONTEXT_RECALL_VALID_VERDICTS = """```json +{ + "verdicts": [ + {"verdict": "yes"}, + {"verdict": "yes"}, + {"verdict": "no"}, + {"verdict": "yes"} ] }```""" CONTEXT_RELEVANCE_VALID_VERDICTS = """```json { "verdicts": [ - { - "verdict": "no", - "reason": "This context does not relate to the query." - }, - { - "verdict": "yes" - }, - { - "verdict": "no", - "reason": "This context is not useful for answering the query." - } + {"verdict": "no"}, + {"verdict": "yes"}, + {"verdict": "no"} ] }```""" @@ -125,38 +156,19 @@ HALLUCINATION_VALID_VERDICTS = """```json { "verdicts": [ - { - "verdict": "no" - }, - - { - "verdict": "yes", - "reason": "The text and context disagree on when Abraham Lincoln was born." - }, - { - "verdict": "yes", - "reason": "The text says that Abraham Lincoln lost the election of 1860, but the context says that Abraham Lincoln won the election of 1860." - } + {"verdict": "no"}, + {"verdict": "yes"}, + {"verdict": "yes"} ] }```""" TOXICITY_VALID_VERDICTS = """```json { "verdicts": [ - { - "verdict": "yes", - "reason": "This opinion demonstrates hate." - }, - { - "verdict": "no" - }, - { - "verdict": "yes", - "reason": "This opinion demonstrates mockery." - }, - { - "verdict": "no" - } + {"verdict": "yes"}, + {"verdict": "no"}, + {"verdict": "yes"}, + {"verdict": "no"} ] }```""" @@ -176,6 +188,148 @@ def test_LLMClient(monkeypatch): Check the metric computations for LLMClient. The client children inherit all of these metric computations. 
""" + def _return_valid_answer_correctness_response(*args, **kwargs): + if "prediction text" in args[1][1]["content"]: + return VALID_STATEMENTS + elif "ground truth text" in args[1][1]["content"]: + return GROUNDTRUTH_VALID_STATEMENTS + elif ( + "Return in JSON format with three keys: 'TP', 'FP', and 'FN'" + in args[1][1]["content"] + ): + return ANSWER_CORRECTNESS_VALID_VERDICTS + else: + raise BadValueInTestLLMClientsError + + def _return_invalid1_answer_correctness_response(*args, **kwargs): + return """```json +{ + "list": [ + "statement 1", + "statement 2", + "statement 3", + "statement 4" + ] +}```""" + + def _return_invalid2_answer_correctness_response(*args, **kwargs): + if "prediction text" in args[1][1]["content"]: + return VALID_STATEMENTS + elif "ground truth text" in args[1][1]["content"]: + return """```json +{ + "statements": [ + "statement 1", + 4, + "statement 3", + "statement 4" + ] +}```""" + else: + raise BadValueInTestLLMClientsError + + def _return_invalid3_answer_correctness_response(*args, **kwargs): + if "prediction text" in args[1][1]["content"]: + return VALID_STATEMENTS + elif "ground truth text" in args[1][1]["content"]: + return GROUNDTRUTH_VALID_STATEMENTS + elif ( + "Return in JSON format with three keys: 'TP', 'FP', and 'FN'" + in args[1][1]["content"] + ): + return """```json +{ + "TP": [ + "statement 1", + "statement 2", + "statement 4" + ], + "FP": [ + "statement 3" + ] +}```""" + else: + raise BadValueInTestLLMClientsError + + def _return_invalid4_answer_correctness_response(*args, **kwargs): + if "prediction text" in args[1][1]["content"]: + return VALID_STATEMENTS + elif "ground truth text" in args[1][1]["content"]: + return GROUNDTRUTH_VALID_STATEMENTS + elif ( + "Return in JSON format with three keys: 'TP', 'FP', and 'FN'" + in args[1][1]["content"] + ): + return """```json +{ + "TP": "statement 1", + "FP": [ + "statement 3" + ], + "FN": [ + "gt statement 1", + "gt statement 4" + ] +}```""" + else: + raise BadValueInTestLLMClientsError + + def _return_invalid5_answer_correctness_response(*args, **kwargs): + if "prediction text" in args[1][1]["content"]: + return VALID_STATEMENTS + elif "ground truth text" in args[1][1]["content"]: + return GROUNDTRUTH_VALID_STATEMENTS + elif ( + "Return in JSON format with three keys: 'TP', 'FP', and 'FN'" + in args[1][1]["content"] + ): + return """```json +{ + "TP": [ + "statement 1", + "statement 2" + ], + "FP": [ + "statement 3" + ], + "FN": [ + "gt statement 1", + "gt statement 4" + ] +}```""" + else: + raise BadValueInTestLLMClientsError + + def _return_invalid6_answer_correctness_response(*args, **kwargs): + if "prediction text" in args[1][1]["content"]: + return VALID_STATEMENTS + elif "ground truth text" in args[1][1]["content"]: + return GROUNDTRUTH_VALID_STATEMENTS + elif ( + "Return in JSON format with three keys: 'TP', 'FP', and 'FN'" + in args[1][1]["content"] + ): + return """```json +{ + "TP": [ + "statement 1", + "statement 2", + "statement 4" + ], + "FP": [ + "statement 3" + ], + "FN": [ + "gt statement 1", + "gt statement 2", + "gt statement 3", + "gt statement 4", + "too many statements in 'FN'" + ] +}```""" + else: + raise BadValueInTestLLMClientsError + def _return_valid_answer_relevance_response(*args, **kwargs): if "generate a list of STATEMENTS" in args[1][1]["content"]: return VALID_STATEMENTS @@ -352,15 +506,6 @@ def _return_invalid4_bias_response(*args, **kwargs): else: raise BadValueInTestLLMClientsError - def _return_valid_coherence_response(*args, **kwargs): - return "5" - - def 
_return_invalid1_coherence_response(*args, **kwargs): - return "The score is 5." - - def _return_invalid2_coherence_response(*args, **kwargs): - return "0" - def _return_valid_context_relevance_response(*args, **kwargs): return CONTEXT_RELEVANCE_VALID_VERDICTS @@ -368,56 +513,99 @@ def _return_invalid1_context_relevance_response(*args, **kwargs): return """```json { "all_verdicts": [ + {"verdict": "no"}, + {"verdict": "yes"}, + {"verdict": "no"} + ] +}```""" + + def _return_valid1_context_precision_response(*args, **kwargs): + return CONTEXT_PRECISION_VALID1_VERDICTS + + def _return_valid2_context_precision_response(*args, **kwargs): + return CONTEXT_PRECISION_VALID2_VERDICTS + + def _return_invalid1_context_precision_response(*args, **kwargs): + return """```json +{ + "invalid_key": [ "verdict 1", "verdict 2", "verdict 3" ] }```""" - def _return_valid1_faithfulness_response(*args, **kwargs): - if ( - "generate a comprehensive list of FACTUAL CLAIMS" - in args[1][1]["content"] - ): - return VALID_CLAIMS + def _return_valid_context_recall_response(*args, **kwargs): + if "generate a list of STATEMENTS" in args[1][1]["content"]: + return VALID_STATEMENTS elif ( - "generate a list of verdicts to indicate whether EACH claim is implied by the context list" + "analyze each ground truth statement and determine if the statement can be attributed to the given context." in args[1][1]["content"] ): - return FAITHFULNESS_VALID_VERDICTS + return CONTEXT_RECALL_VALID_VERDICTS else: raise BadValueInTestLLMClientsError - def _return_valid2_faithfulness_response(*args, **kwargs): + def _return_invalid1_context_recall_response(*args, **kwargs): return """```json { - "claims": [] + "invalid_key": [ + "statement 1", + "statement 2", + "statement 3", + "statement 4" + ] }```""" - def _return_invalid1_faithfulness_response(*args, **kwargs): - if ( - "generate a comprehensive list of FACTUAL CLAIMS" + def _return_invalid2_context_recall_response(*args, **kwargs): + return """```json +{ + "statements": [ + 1, + "statement 2", + "statement 3", + "statement 4" + ] +}```""" + + def _return_invalid3_context_recall_response(*args, **kwargs): + if "generate a list of STATEMENTS" in args[1][1]["content"]: + return VALID_STATEMENTS + elif ( + "analyze each ground truth statement and determine if the statement can be attributed to the given context." in args[1][1]["content"] ): - return VALID_CLAIMS + return """```json +{ + "invalid_key": [ + "verdict 1", + "verdict 2", + "verdict 3", + "verdict 4" + ] +}```""" + else: + raise BadValueInTestLLMClientsError + + def _return_invalid4_context_recall_response(*args, **kwargs): + if "generate a list of STATEMENTS" in args[1][1]["content"]: + return VALID_STATEMENTS elif ( - "generate a list of verdicts to indicate whether EACH claim is implied by the context list" + "analyze each ground truth statement and determine if the statement can be attributed to the given context." 
in args[1][1]["content"] ): return """```json { - "list": [ + "verdicts": [ "verdict 1", "verdict 2", - "verdict 3", - "verdict 4", - "verdict 5" + "verdict 3" ] }```""" else: raise BadValueInTestLLMClientsError - def _return_invalid2_faithfulness_response(*args, **kwargs): + def _return_valid1_faithfulness_response(*args, **kwargs): if ( "generate a comprehensive list of FACTUAL CLAIMS" in args[1][1]["content"] @@ -427,18 +615,35 @@ def _return_invalid2_faithfulness_response(*args, **kwargs): "generate a list of verdicts to indicate whether EACH claim is implied by the context list" in args[1][1]["content"] ): - return """```json + return FAITHFULNESS_VALID_VERDICTS + else: + raise BadValueInTestLLMClientsError + + def _return_valid2_faithfulness_response(*args, **kwargs): + return """```json +{ + "claims": [] +}```""" + + def _return_invalid1_faithfulness_response(*args, **kwargs): + return """```json +{ + "invalid_key": [ + "claim 1", + "claim 2" + ] +}```""" + + def _return_invalid2_faithfulness_response(*args, **kwargs): + return """```json { "claims": [ - "verdict 1", - 2, - "verdict 3", - "verdict 4", - "verdict 5" + [ + "claim 1", + "claim 2" + ] ] }```""" - else: - raise BadValueInTestLLMClientsError def _return_invalid3_faithfulness_response(*args, **kwargs): if ( @@ -479,7 +684,7 @@ def _return_invalid4_faithfulness_response(*args, **kwargs): {"verdict": "no"}, {"verdict": "yes"}, {"verdict": "yes"}, - {"verdict": "yes"}, + {"verdict": "yes"} ] }```""" else: @@ -521,6 +726,15 @@ def _return_invalid1_hallucination_response(*args, **kwargs): ] }```""" + def _return_valid_summary_coherence_response(*args, **kwargs): + return "5" + + def _return_invalid1_summary_coherence_response(*args, **kwargs): + return "The score is 5." + + def _return_invalid2_summary_coherence_response(*args, **kwargs): + return "0" + def _return_valid1_toxicity_response(*args, **kwargs): if "generate a list of OPINIONS" in args[1][1]["content"]: return VALID_OPINIONS @@ -621,6 +835,75 @@ def _return_invalid4_toxicity_response(*args, **kwargs): with pytest.raises(NotImplementedError): client(fake_message) + # Patch __call__ with a valid response. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_valid_answer_correctness_response, + ) + assert 0.6666666666666666 == client.answer_correctness( + "some query", "prediction text", ["ground truth text"] + ) + + # Needs to have 'statements' key. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid1_answer_correctness_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.answer_correctness( + "some query", "prediction text", ["ground truth text"] + ) + + # Should fail if ground truth statements are invalid even when prediction statements are valid + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid2_answer_correctness_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.answer_correctness( + "some query", "prediction text", ["ground truth text"] + ) + + # Missing 'FN' in dictionary + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid3_answer_correctness_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.answer_correctness( + "some query", "prediction text", ["ground truth text"] + ) + + # TP has an invalid value. 
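+    # For reference: the 0.6666666666666666 asserted above is consistent with an
+    # F1-style score over the mocked ANSWER_CORRECTNESS_VALID_VERDICTS (3 TP, 1 FP,
+    # 2 FN). A minimal sketch of that arithmetic, assuming an F1 formulation rather
+    # than quoting the implementation:
+    #
+    #     tp, fp, fn = 3, 1, 2
+    #     assert 2 * tp / (2 * tp + fp + fn) == 0.6666666666666666
+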
+ monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid4_answer_correctness_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.answer_correctness( + "some query", "prediction text", ["ground truth text"] + ) + + # Number of TP + FP does not equal the number of prediction statements + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid5_answer_correctness_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.answer_correctness( + "some query", "prediction text", ["ground truth text"] + ) + + # The number of FN is more than the number of ground truth statements + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid6_answer_correctness_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.answer_correctness( + "some query", "prediction text", ["ground truth text"] + ) + # Patch __call__ with a valid response. monkeypatch.setattr( "valor_api.backend.core.llm_clients.LLMClient.__call__", @@ -709,25 +992,125 @@ def _return_invalid4_toxicity_response(*args, **kwargs): # Patch __call__ with a valid response. monkeypatch.setattr( "valor_api.backend.core.llm_clients.LLMClient.__call__", - _return_valid_coherence_response, + _return_valid1_context_precision_response, + ) + assert 0.45 == client.context_precision( + "some query", + ["context 1", "context 2", "context 3", "context 4", "context 5"], + ["some ground truth"], + ) + + # If all verdicts are "no", the returned score should be 0. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_valid2_context_precision_response, + ) + assert 0.0 == client.context_precision( + "some query", + ["context 1", "context 2", "context 3", "context 4", "context 5"], + ["some ground truth"], + ) + + # Context precision is meaningless if context_list is empty. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_valid1_context_precision_response, + ) + with pytest.raises(ValueError): + client.context_precision( + "some query", + [], + ["some ground truth"], + ) + + # Only 1 context provided but 5 verdicts were returned. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_valid1_context_precision_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.context_precision( + "some query", + ["length of context list does not match LLM's response"], + ["some ground truth"], + ) + + # Key 'invalid_key' is returned but the key should be 'verdicts'. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid1_context_precision_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.context_precision( + "some query", + ["context 1", "context 2", "context 3", "context 4", "context 5"], + ["some ground truth"], + ) + + # Patch __call__ with a valid response. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_valid_context_recall_response, + ) + assert 0.75 == client.context_recall( + ["context 1", "context 2"], + ["some ground truth"], ) - assert 5 == client.coherence("some text") - # Coherence score is not an integer. + # Context recall is meaningless if context_list is empty. 
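+    # For reference: the context precision and context recall values asserted above
+    # are consistent with the mocked verdicts. With CONTEXT_PRECISION_VALID1_VERDICTS
+    # of [no, yes, no, no, yes], a mean-precision@k reading over the relevant ranks
+    # gives 0.45, and with CONTEXT_RECALL_VALID_VERDICTS of [yes, yes, no, yes] the
+    # attributable fraction is 0.75. A rough sketch of that arithmetic, assuming
+    # these formulations rather than quoting the implementation:
+    #
+    #     precision_at_k = [1 / 2, 2 / 5]  # relevant contexts at ranks 2 and 5
+    #     assert sum(precision_at_k) / len(precision_at_k) == 0.45
+    #     assert 3 / 4 == 0.75  # 3 of 4 ground truth statements attributable
+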
monkeypatch.setattr( "valor_api.backend.core.llm_clients.LLMClient.__call__", - _return_invalid1_coherence_response, + _return_valid_context_recall_response, + ) + with pytest.raises(ValueError): + client.context_recall( + [], + ["some ground truth"], + ) + + # Ground truth statements response must have key 'statements'. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid1_context_recall_response, ) with pytest.raises(InvalidLLMResponseError): - client.coherence("some text") + client.context_recall( + ["context 1", "context 2"], + ["some ground truth"], + ) - # Coherence score is 0, which is not in {1,2,3,4,5}. + # Ground truth statements must be strings. monkeypatch.setattr( "valor_api.backend.core.llm_clients.LLMClient.__call__", - _return_invalid2_coherence_response, + _return_invalid2_context_recall_response, ) with pytest.raises(InvalidLLMResponseError): - client.coherence("some text") + client.context_recall( + ["context 1", "context 2"], + ["some ground truth"], + ) + + # Context recall verdicts response must have key 'verdicts'. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid3_context_recall_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.context_recall( + ["context 1", "context 2"], + ["some ground truth"], + ) + + # Number of context recall verdicts doesn't match the number of ground truth statements. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid4_context_recall_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.context_recall( + ["context 1", "context 2"], + ["some ground truth"], + ) # Patch __call__ with a valid response. monkeypatch.setattr( @@ -789,7 +1172,7 @@ def _return_invalid4_toxicity_response(*args, **kwargs): with pytest.raises(ValueError): client.faithfulness("some text", []) - # Bad key in the response. + # Bad key in the claims response. monkeypatch.setattr( "valor_api.backend.core.llm_clients.LLMClient.__call__", _return_invalid1_faithfulness_response, @@ -797,7 +1180,7 @@ def _return_invalid4_toxicity_response(*args, **kwargs): with pytest.raises(InvalidLLMResponseError): client.faithfulness("some text", ["context 1", "context 2"]) - # Invalid claim value. + # Claims must be strings, not lists of strings. monkeypatch.setattr( "valor_api.backend.core.llm_clients.LLMClient.__call__", _return_invalid2_faithfulness_response, @@ -805,7 +1188,7 @@ def _return_invalid4_toxicity_response(*args, **kwargs): with pytest.raises(InvalidLLMResponseError): client.faithfulness("some text", ["context 1", "context 2"]) - # Bad key in the response. + # Bad key in the verdicts response. monkeypatch.setattr( "valor_api.backend.core.llm_clients.LLMClient.__call__", _return_invalid3_faithfulness_response, @@ -867,6 +1250,29 @@ def _return_invalid4_toxicity_response(*args, **kwargs): "some query", ["context 1", "context 2", "context 3"] ) + # Patch __call__ with a valid response. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_valid_summary_coherence_response, + ) + assert 5 == client.summary_coherence("some text", "some summary") + + # Summary coherence score is not an integer. 
+ monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid1_summary_coherence_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.summary_coherence("some text", "some summary") + + # Summary coherence score is 0, which is not in {1,2,3,4,5}. + monkeypatch.setattr( + "valor_api.backend.core.llm_clients.LLMClient.__call__", + _return_invalid2_summary_coherence_response, + ) + with pytest.raises(InvalidLLMResponseError): + client.summary_coherence("some text", "some summary") + # Patch __call__ with a valid response. monkeypatch.setattr( "valor_api.backend.core.llm_clients.LLMClient.__call__", diff --git a/api/tests/functional-tests/backend/metrics/test_text_generation.py b/api/tests/functional-tests/backend/metrics/test_text_generation.py index 2a607385f..669606433 100644 --- a/api/tests/functional-tests/backend/metrics/test_text_generation.py +++ b/api/tests/functional-tests/backend/metrics/test_text_generation.py @@ -28,6 +28,13 @@ ] +RAG_REFERENCES = [ + """John Adams and Alexander Hamilton did not get along. John Adams had grown independent of his cabinet, often making decisions despite opposition from it.\n""", # same as prediction with some strings deleted + """Yes, Lincoln won the election of 1860. He received the highest number of votes and a majority in the Electoral College, making him the 16th President of the United States. However, it's important to note that he won entirely due to his support in the North and West, as he did not receive any votes in 10 of the 15 Southern slave states.""", # same as prediction + """If kept warm, it would hatch a coyote.""", # very different than prediction +] + + RAG_CONTEXT = [ [ """Although aware of Hamilton\'s influence, Adams was convinced that their retention ensured a smoother succession. Adams maintained the economic programs of Hamilton, who regularly consulted with key cabinet members, especially the powerful Treasury Secretary, Oliver Wolcott Jr. Adams was in other respects quite independent of his cabinet, often making decisions despite opposition from it. Hamilton had grown accustomed to being regularly consulted by Washington. Shortly after Adams was inaugurated, Hamilton sent him a detailed letter with policy suggestions. Adams dismissively ignored it.\n\nFailed peace commission and XYZ affair\nHistorian Joseph Ellis writes that "[t]he Adams presidency was destined to be dominated by a single question of American policy to an extent seldom if ever encountered by any succeeding occupant of the office." That question was whether to make war with France or find peace. Britain and France were at war as a result of the French Revolution. Hamilton and the Federalists strongly favored the British monarchy against what they denounced as the political radicalism and anti-religious frenzy of the French Revolution. Jefferson and the Republicans, with their firm opposition to monarchy, strongly supported the French overthrowing their king. The French had supported Jefferson for president in 1796 and became belligerent at his loss.""", @@ -56,6 +63,16 @@ """Subject: Project Delay Due to Funding Cuts\n\nDear [Coworker's Name],\n\nI hope this message finds you well. I am writing to update you on the status of our project and unfortunately, convey some disappointing news.\n\nDue to recent funding cuts within our department, we have had to make some adjustments to project assignments. 
As a result, multiple employees, including key team members for our current project, have been moved to different projects to accommodate the changes. This unexpected shift has impacted our project timeline.\n\nI regret to inform you that our project deadline will need to be pushed back in light of these developments. I understand the inconvenience this may cause and I sincerely apologize for any disruption this may cause to your schedule or other commitments.\n\nPlease rest assured that despite these unforeseen circumstances, I am fully committed to completing the project efficiently and effectively. I will work closely with the team to develop a revised timeline and ensure that we deliver quality work that meets our objectives.\n\nThank you for your understanding and continued support during this challenging period. I value your collaboration and look forward to working together to overcome this setback and achieve our project goals.\n\nIf you have any questions or concerns, please feel free to reach out to me. I appreciate your patience as we navigate through this situation together.\n\nBest regards,\n\n[Your Name]""", ] +SUMMARIZATION_TEXTS = [ + """Aston Villa take on Liverpool in their FA Cup semi-final encounter on Sunday with the competition both sides' last chance to win any silverware this season. Sportsmail columnist Jamie Redknapp looks ahead to the Wembley showdown and where the match could be won and lost with individual player duels. CHRISTIAN BENTEKE v MARTIN SKRTEL . This will be a heavyweight contest that could decide the game. Christian Benteke is superb in the air and Martin Skrtel will have his hands full. Liverpool have to stop the supply line because defending crosses has been their Achilles heel this season. Christian Benteke (centre) scored the only goal of the game as Villa won 1-0 at Tottenham on April 11 . Liverpool defender Martin Skrtel (right) will have his hands full trying to stop Benteke on Sunday afternoon . FABIAN DELPH v JORDAN HENDERSON . This should be a good contest between two England team-mates. Fabian Delph’s new deal was a real boost for Villa - he drives that midfield, though he doesn’t get enough goals. You used to say the same about Jordan Henderson but he has improved so much. England international Fabian Delph (left) and Jordan Henderson are set for a midfield battle at Wembley . RAHEEM STERLING v RON VLAAR and NATHAN BAKER . Ron Vlaar and Nathan Baker make an imposing back line but they would rather be up against a Benteke than a Raheem Sterling, who will float around and make himself difficult to mark so he can use his lightning pace to get in behind them. Raheem Sterling's (left) pace and trickery is bound to cause the Villa defence a lot of problems . Ron Vlaar (left) was part of the Villa defence that kept a clean sheet at Spurs in the Premier League . The Holland international and Nathan Baker (right) will be hoping to do likewise against the Reds at Wembley.""", + """Juventus and Liverpool are continuing to monitor developments with Chelsea midfielder Oscar. The Brazil international has been criticised by Jose Mourinho in recent weeks and there are question marks over his future. Chelsea want to strengthen in the summer and may need a high profile departure to help balance the books. Juventus and Liverpool are interested in signing Chelsea 23-year-old midfielder Oscar . Oscar in action during Chelsea's 1-0 Premier League victory against Queens Park Rangers last weekend . 
Oscar cost Chelsea £19.35m and they would want a substantial profit on the 23 year-old. Paris Saintt Germain have shown interest in the past also. Juventus want a playmaker for next season and Brazil boss Carlos Dunga advised them to buy Oscar. 'He reminds me of Roberto Baggio,' he said. 'Oscar has technique, reads situations well and is a modern and versatile trequartista. He reminds me of Roberto Baggio, but also has similarities to Massimiliano Allegri. The former Sao Paulo youngster has struggled to make an impact for Chelsea this season . Brazil coach Dunga (pictured) revealed the Chelsea midfielder reminds him of Roberto Baggio . 'Brazilians like to have fun with their football, which hasn’t happened to Oscar very much recently, but I met Jose Mourinho and he spoke highly of all his Brazilian players. 'I tell Allegri that Oscar is strong and also a good lad. A forward line with him, Carlos Tevez and Alvaro Morata would drive any Coach crazy. 'It wouldn’t be a step backwards for Oscar to go to Juventus. He’d be decisive in Serie A and whether he plays for Juventus or Chelsea it’ll always be a great club.' Oscar celebrates scoring Chelsea's fourth goal during the 5-0 victory against Swansea in January.""", +] + +SUMMARIZATION_PREDICTIONS = [ + """Aston Villa and Liverpool face off in the FA Cup semi-final as both teams look to secure their last chance at silverware this season. Sportsmail columnist Jamie Redknapp analyzes key player duels that could decide the game, such as Christian Benteke against Martin Skrtel, Fabian Delph against Jordan Henderson, and Raheem Sterling against Ron Vlaar and Nathan Baker. Redknapp emphasizes the importance of stopping the supply line to Benteke and dealing with Sterling's pace and trickery in the match.""", + """Juventus and Liverpool are showing interest in Chelsea midfielder Oscar, who has faced criticism and uncertainty about his future at the club. Chelsea may need to sell a high-profile player to strengthen their squad in the summer. Oscar, who was signed for £19.35m, has also attracted interest from Paris Saint-Germain in the past. Brazil coach Carlos Dunga sees qualities in Oscar similar to Roberto Baggio and believes he could be a key player for Juventus.""", +] + @pytest.fixture def rag_dataset_name() -> str: @@ -100,15 +117,6 @@ def rag_q2() -> schemas.Datum: ) -@pytest.fixture -def rag_references(): - return [ - """John Adams and Alexander Hamilton did not get along. John Adams had grown independent of his cabinet, often making decisions despite opposition from it.\n""", # same as prediction with some strings deleted - """Yes, Lincoln won the election of 1860. He received the highest number of votes and a majority in the Electoral College, making him the 16th President of the United States. 
However, it's important to note that he won entirely due to his support in the North and West, as he did not receive any votes in 10 of the 15 Southern slave states.""", # same as prediction - """If kept warm, it would hatch a coyote.""", # very different than prediction - ] - - @pytest.fixture def rag_data( db: Session, @@ -117,7 +125,6 @@ def rag_data( rag_q0: schemas.Datum, rag_q1: schemas.Datum, rag_q2: schemas.Datum, - rag_references: list[str], ): datums = [rag_q0, rag_q1, rag_q2] @@ -128,7 +135,7 @@ def rag_data( dataset_name=rag_dataset_name, datum=datums[i], annotations=[ - schemas.Annotation(text=rag_references[i]), + schemas.Annotation(text=RAG_REFERENCES[i]), schemas.Annotation(text="some other text"), schemas.Annotation(text="some final text"), ], @@ -334,6 +341,114 @@ def content_gen_data( assert len(db.query(models.Label).all()) == 0 +@pytest.fixture +def summarization_dataset_name() -> str: + return "summarization_test_dataset" + + +@pytest.fixture +def summarization_model_name() -> str: + return "summarization_test_model" + + +@pytest.fixture +def summarization_q0() -> schemas.Datum: + return schemas.Datum( + uid="uid0", + text=SUMMARIZATION_TEXTS[0], + ) + + +@pytest.fixture +def summarization_q1() -> schemas.Datum: + return schemas.Datum( + uid="uid1", + text=SUMMARIZATION_TEXTS[1], + ) + + +@pytest.fixture +def summarization_data( + db: Session, + summarization_dataset_name: str, + summarization_model_name: str, + summarization_q0: schemas.Datum, + summarization_q1: schemas.Datum, +): + datums = [summarization_q0, summarization_q1] + + gts = [] + for i in range(len(datums)): + gts.append( + schemas.GroundTruth( + dataset_name=summarization_dataset_name, + datum=datums[i], + annotations=[], + ) + ) + + preds = [] + for i in range(len(datums)): + preds.append( + schemas.Prediction( + dataset_name=summarization_dataset_name, + model_name=summarization_model_name, + datum=datums[i], + annotations=[ + schemas.Annotation( + text=SUMMARIZATION_PREDICTIONS[i], + ) + ], + ) + ) + + crud.create_dataset( + db=db, + dataset=schemas.Dataset( + name=summarization_dataset_name, + ), + ) + + crud.create_groundtruths(db=db, groundtruths=gts) + crud.finalize(db=db, dataset_name=summarization_dataset_name) + + crud.create_model( + db=db, + model=schemas.Model( + name=summarization_model_name, + ), + ) + crud.create_predictions(db=db, predictions=preds) + crud.finalize( + db=db, + dataset_name=summarization_dataset_name, + model_name=summarization_model_name, + ) + + # 2 groundtruths with 1 annotations, 2 predictions with 1 annotation + assert len(db.query(models.Datum).all()) == 2 + assert len(db.query(models.Annotation).all()) == 4 + assert ( + len( + db.query(models.Annotation) + .where(models.Annotation.model_id.is_(None)) + .all() + ) + == 2 + ) + assert ( + len( + db.query(models.Annotation) + .where(models.Annotation.model_id.isnot(None)) + .all() + ) + == 2 + ) + assert len(db.query(models.GroundTruth).all()) == 0 + assert len(db.query(models.Prediction).all()) == 0 + assert len(db.query(models.Label).all()) == 0 + + @pytest.fixture def two_text_generation_datasets( db: Session, @@ -342,7 +457,6 @@ def two_text_generation_datasets( rag_q0: schemas.Datum, rag_q1: schemas.Datum, rag_q2: schemas.Datum, - rag_references: list[str], content_gen_dataset_name: str, content_gen_model_name: str, content_gen_q0: schemas.Datum, @@ -359,7 +473,7 @@ def two_text_generation_datasets( dataset_name=rag_dataset_name, datum=datums[i], annotations=[ - schemas.Annotation(text=rag_references[i]), + 
schemas.Annotation(text=RAG_REFERENCES[i]), schemas.Annotation(text="some other text"), schemas.Annotation(text="some final text"), ], @@ -494,6 +608,34 @@ def mocked_connection(self): pass +def mocked_answer_correctness( + self, + query: str, + prediction: str, + groundtruth_list: list[str], +): + ret_dict = { + ( + RAG_QUERIES[0], + RAG_PREDICTIONS[0], + tuple([RAG_REFERENCES[0], "some other text", "some final text"]), + ): 0.8, + ( + RAG_QUERIES[1], + RAG_PREDICTIONS[1], + tuple([RAG_REFERENCES[1], "some other text", "some final text"]), + ): 1.0, + ( + RAG_QUERIES[2], + RAG_PREDICTIONS[2], + tuple([RAG_REFERENCES[2], "some other text", "some final text"]), + ): 0.0, + } + if (query, prediction, tuple(groundtruth_list)) in ret_dict: + return ret_dict[(query, prediction, tuple(groundtruth_list))] + return 0.0 + + def mocked_answer_relevance( self, query: str, @@ -522,19 +664,62 @@ def mocked_bias( return ret_dict[text] -def mocked_coherence( +def mocked_context_precision( self, - text: str, + query: str, + ordered_context_list: list[str], + groundtruth_list: list[str], ): ret_dict = { - RAG_PREDICTIONS[0]: 4, - RAG_PREDICTIONS[1]: 5, - RAG_PREDICTIONS[2]: 4, - CONTENT_GEN_PREDICTIONS[0]: 5, - CONTENT_GEN_PREDICTIONS[1]: 5, - CONTENT_GEN_PREDICTIONS[2]: 5, + ( + RAG_QUERIES[0], + tuple(RAG_CONTEXT[0]), + tuple([RAG_REFERENCES[0], "some other text", "some final text"]), + ): 1.0, + ( + RAG_QUERIES[1], + tuple(RAG_CONTEXT[1]), + tuple([RAG_REFERENCES[1], "some other text", "some final text"]), + ): 1.0, + ( + RAG_QUERIES[2], + tuple(RAG_CONTEXT[2]), + tuple([RAG_REFERENCES[2], "some other text", "some final text"]), + ): 1.0, } - return ret_dict[text] + if ( + query, + tuple(ordered_context_list), + tuple(groundtruth_list), + ) in ret_dict: + return ret_dict[ + (query, tuple(ordered_context_list), tuple(groundtruth_list)) + ] + return 0.0 + + +def mocked_context_recall( + self, + context_list: list[str], + groundtruth_list: list[str], +): + ret_dict = { + ( + tuple(RAG_CONTEXT[0]), + tuple([RAG_REFERENCES[0], "some other text", "some final text"]), + ): 0.8, + ( + tuple(RAG_CONTEXT[1]), + tuple([RAG_REFERENCES[1], "some other text", "some final text"]), + ): 0.5, + ( + tuple(RAG_CONTEXT[2]), + tuple([RAG_REFERENCES[2], "some other text", "some final text"]), + ): 0.2, + } + if (tuple(context_list), tuple(groundtruth_list)) in ret_dict: + return ret_dict[(tuple(context_list), tuple(groundtruth_list))] + return 0.0 def mocked_context_relevance( @@ -576,6 +761,18 @@ def mocked_hallucination( return ret_dict[(text, tuple(context_list))] +def mocked_summary_coherence( + self, + text: str, + summary: str, +): + ret_dict = { + (SUMMARIZATION_TEXTS[0], SUMMARIZATION_PREDICTIONS[0]): 4, + (SUMMARIZATION_TEXTS[1], SUMMARIZATION_PREDICTIONS[1]): 5, + } + return ret_dict[(text, summary)] + + def mocked_toxicity( self, text: str, @@ -602,6 +799,10 @@ def mocked_compute_rouge_none(*args, **kwargs): "valor_api.backend.core.llm_clients.WrappedOpenAIClient.connect", mocked_connection, ) +@patch( + "valor_api.backend.core.llm_clients.WrappedOpenAIClient.answer_correctness", + mocked_answer_correctness, +) @patch( "valor_api.backend.core.llm_clients.WrappedOpenAIClient.answer_relevance", mocked_answer_relevance, @@ -611,8 +812,12 @@ def mocked_compute_rouge_none(*args, **kwargs): mocked_bias, ) @patch( - "valor_api.backend.core.llm_clients.WrappedOpenAIClient.coherence", - mocked_coherence, + "valor_api.backend.core.llm_clients.WrappedOpenAIClient.context_precision", + mocked_context_precision, +) +@patch( + 
"valor_api.backend.core.llm_clients.WrappedOpenAIClient.context_recall", + mocked_context_recall, ) @patch( "valor_api.backend.core.llm_clients.WrappedOpenAIClient.context_relevance", @@ -679,10 +884,12 @@ def test__compute_text_generation_rag( prediction_filter = datum_filter.model_copy() metrics_to_return = [ + MetricType.AnswerCorrectness, MetricType.AnswerRelevance, MetricType.Bias, MetricType.BLEU, - MetricType.Coherence, + MetricType.ContextPrecision, + MetricType.ContextRecall, MetricType.ContextRelevance, MetricType.Faithfulness, MetricType.Hallucination, @@ -707,10 +914,12 @@ def test__compute_text_generation_rag( expected_values = { "uid0": { + schemas.AnswerCorrectnessMetric: 0.8, schemas.AnswerRelevanceMetric: 0.6666666666666666, schemas.BiasMetric: 0.0, schemas.BLEUMetric: 0.3502270395690205, - schemas.CoherenceMetric: 4, + schemas.ContextPrecisionMetric: 1.0, + schemas.ContextRecallMetric: 0.8, schemas.ContextRelevanceMetric: 0.75, schemas.FaithfulnessMetric: 0.4, schemas.HallucinationMetric: 0.0, @@ -720,13 +929,16 @@ def test__compute_text_generation_rag( "rougeL": 0.5925925925925926, "rougeLsum": 0.5925925925925926, }, + schemas.SummaryCoherenceMetric: None, schemas.ToxicityMetric: 0.0, }, "uid1": { + schemas.AnswerCorrectnessMetric: 1.0, schemas.AnswerRelevanceMetric: 0.2, schemas.BiasMetric: 0.0, schemas.BLEUMetric: 1.0, - schemas.CoherenceMetric: 5, + schemas.ContextPrecisionMetric: 1.0, + schemas.ContextRecallMetric: 0.5, schemas.ContextRelevanceMetric: 1.0, schemas.FaithfulnessMetric: 0.55, schemas.HallucinationMetric: 0.0, @@ -736,13 +948,16 @@ def test__compute_text_generation_rag( "rougeL": 1.0, "rougeLsum": 1.0, }, + schemas.SummaryCoherenceMetric: None, schemas.ToxicityMetric: 0.0, }, "uid2": { + schemas.AnswerCorrectnessMetric: 0.0, schemas.AnswerRelevanceMetric: 0.2, schemas.BiasMetric: 0.0, schemas.BLEUMetric: 0.05434912989707719, - schemas.CoherenceMetric: 4, + schemas.ContextPrecisionMetric: 1.0, + schemas.ContextRecallMetric: 0.2, schemas.ContextRelevanceMetric: 0.25, schemas.FaithfulnessMetric: 0.6666666666666666, schemas.HallucinationMetric: 0.25, @@ -752,11 +967,13 @@ def test__compute_text_generation_rag( "rougeL": 0.18666666666666668, "rougeLsum": 0.18666666666666668, }, + schemas.SummaryCoherenceMetric: None, schemas.ToxicityMetric: 0.0, }, } assert metrics + assert len(metrics) == len(metrics_to_return) * len(expected_values) for metric in metrics: assert isinstance(metric.parameters, dict) assert isinstance(metric.parameters["datum_uid"], str) @@ -961,6 +1178,10 @@ def test__compute_text_generation_rag( "valor_api.backend.core.llm_clients.WrappedOpenAIClient.connect", mocked_connection, ) +@patch( + "valor_api.backend.core.llm_clients.WrappedOpenAIClient.answer_correctness", + mocked_answer_correctness, +) @patch( "valor_api.backend.core.llm_clients.WrappedOpenAIClient.answer_relevance", mocked_answer_relevance, @@ -970,8 +1191,12 @@ def test__compute_text_generation_rag( mocked_bias, ) @patch( - "valor_api.backend.core.llm_clients.WrappedOpenAIClient.coherence", - mocked_coherence, + "valor_api.backend.core.llm_clients.WrappedOpenAIClient.context_precision", + mocked_context_precision, +) +@patch( + "valor_api.backend.core.llm_clients.WrappedOpenAIClient.context_recall", + mocked_context_recall, ) @patch( "valor_api.backend.core.llm_clients.WrappedOpenAIClient.context_relevance", @@ -996,10 +1221,12 @@ def test_text_generation_rag( rag_data, ): metrics_to_return = [ + MetricType.AnswerCorrectness, MetricType.AnswerRelevance, MetricType.Bias, 
MetricType.BLEU, - MetricType.Coherence, + MetricType.ContextPrecision, + MetricType.ContextRecall, MetricType.ContextRelevance, MetricType.Faithfulness, MetricType.Hallucination, @@ -1055,10 +1282,12 @@ def test_text_generation_rag( expected_values = { "uid0": { + "AnswerCorrectness": 0.8, "AnswerRelevance": 0.6666666666666666, "Bias": 0.0, "BLEU": 0.3502270395690205, - "Coherence": 4, + "ContextPrecision": 1.0, + "ContextRecall": 0.8, "ContextRelevance": 0.75, "Faithfulness": 0.4, "Hallucination": 0.0, @@ -1071,10 +1300,12 @@ def test_text_generation_rag( "Toxicity": 0.0, }, "uid1": { + "AnswerCorrectness": 1.0, "AnswerRelevance": 0.2, "Bias": 0.0, "BLEU": 1.0, - "Coherence": 5, + "ContextPrecision": 1.0, + "ContextRecall": 0.5, "ContextRelevance": 1.0, "Faithfulness": 0.55, "Hallucination": 0.0, @@ -1087,10 +1318,12 @@ def test_text_generation_rag( "Toxicity": 0.0, }, "uid2": { + "AnswerCorrectness": 0.0, "AnswerRelevance": 0.2, "Bias": 0.0, "BLEU": 0.05434912989707719, - "Coherence": 4, + "ContextPrecision": 1.0, + "ContextRecall": 0.2, "ContextRelevance": 0.25, "Faithfulness": 0.6666666666666666, "Hallucination": 0.25, @@ -1105,6 +1338,7 @@ def test_text_generation_rag( } assert metrics + assert len(metrics) == len(metrics_to_return) * len(expected_values) for metric in metrics: assert isinstance(metric.parameters, dict) assert ( @@ -1135,18 +1369,10 @@ def test_text_generation_rag( "valor_api.backend.core.llm_clients.WrappedOpenAIClient.connect", mocked_connection, ) -@patch( - "valor_api.backend.core.llm_clients.WrappedOpenAIClient.answer_relevance", - mocked_answer_relevance, -) @patch( "valor_api.backend.core.llm_clients.WrappedOpenAIClient.bias", mocked_bias, ) -@patch( - "valor_api.backend.core.llm_clients.WrappedOpenAIClient.coherence", - mocked_coherence, -) @patch( "valor_api.backend.core.llm_clients.WrappedOpenAIClient.toxicity", mocked_toxicity, @@ -1159,7 +1385,6 @@ def test_text_generation_content_gen( ): metrics_to_return = [ MetricType.Bias, - MetricType.Coherence, MetricType.Toxicity, ] @@ -1204,22 +1429,20 @@ def test_text_generation_content_gen( expected_values = { "uid0": { "Bias": 0.2, - "Coherence": 5, "Toxicity": 0.4, }, "uid1": { "Bias": 0.0, - "Coherence": 5, "Toxicity": 0.0, }, "uid2": { "Bias": 0.0, - "Coherence": 5, "Toxicity": 0.0, }, } assert metrics + assert len(metrics) == len(metrics_to_return) * len(expected_values) for metric in metrics: assert isinstance(metric.parameters, dict) assert ( @@ -1232,6 +1455,85 @@ def test_text_generation_content_gen( "valor_api.backend.core.llm_clients.WrappedOpenAIClient.connect", mocked_connection, ) +@patch( + "valor_api.backend.core.llm_clients.WrappedOpenAIClient.summary_coherence", + mocked_summary_coherence, +) +def test_text_generation_summarization( + db: Session, + summarization_dataset_name: str, + summarization_model_name: str, + summarization_data, +): + metrics_to_return = [ + MetricType.SummaryCoherence, + ] + + # default request + job_request = schemas.EvaluationRequest( + dataset_names=[summarization_dataset_name], + model_names=[summarization_model_name], + parameters=schemas.EvaluationParameters( + task_type=TaskType.TEXT_GENERATION, + metrics_to_return=metrics_to_return, + llm_api_params={ + "client": "openai", + "data": { + "seed": 2024, + "model": "gpt-4o", + }, + }, + ), + ) + + # creates evaluation job + evaluations = create_or_get_evaluations(db=db, job_request=job_request) + assert len(evaluations) == 1 + assert evaluations[0].status == EvaluationStatus.PENDING + + # computation, normally run 
as background task + _ = compute_text_generation_metrics( + db=db, + evaluation_id=evaluations[0].id, + ) + + # get evaluations + evaluations = create_or_get_evaluations(db=db, job_request=job_request) + assert len(evaluations) == 1 + assert evaluations[0].status in { + EvaluationStatus.RUNNING, + EvaluationStatus.DONE, + } + + metrics = evaluations[0].metrics + + expected_values = { + "uid0": { + "SummaryCoherence": 4, + }, + "uid1": { + "SummaryCoherence": 5, + }, + } + + assert metrics + assert len(metrics) == len(metrics_to_return) * len(expected_values) + for metric in metrics: + assert isinstance(metric.parameters, dict) + assert ( + expected_values[metric.parameters["datum_uid"]][metric.type] + == metric.value + ) + + +@patch( + "valor_api.backend.core.llm_clients.WrappedOpenAIClient.connect", + mocked_connection, +) +@patch( + "valor_api.backend.core.llm_clients.WrappedOpenAIClient.answer_correctness", + mocked_answer_correctness, +) @patch( "valor_api.backend.core.llm_clients.WrappedOpenAIClient.answer_relevance", mocked_answer_relevance, @@ -1241,8 +1543,12 @@ def test_text_generation_content_gen( mocked_bias, ) @patch( - "valor_api.backend.core.llm_clients.WrappedOpenAIClient.coherence", - mocked_coherence, + "valor_api.backend.core.llm_clients.WrappedOpenAIClient.context_precision", + mocked_context_precision, +) +@patch( + "valor_api.backend.core.llm_clients.WrappedOpenAIClient.context_recall", + mocked_context_recall, ) @patch( "valor_api.backend.core.llm_clients.WrappedOpenAIClient.context_relevance", @@ -1270,10 +1576,12 @@ def test_text_generation_two_datasets( ): # test with a RAG dataset metrics_to_return = [ + MetricType.AnswerCorrectness, MetricType.AnswerRelevance, MetricType.Bias, MetricType.BLEU, - MetricType.Coherence, + MetricType.ContextPrecision, + MetricType.ContextRecall, MetricType.ContextRelevance, MetricType.Faithfulness, MetricType.Hallucination, @@ -1329,10 +1637,12 @@ def test_text_generation_two_datasets( expected_values = { "uid0": { + "AnswerCorrectness": 0.8, "AnswerRelevance": 0.6666666666666666, "Bias": 0.0, "BLEU": 0.3502270395690205, - "Coherence": 4, + "ContextPrecision": 1.0, + "ContextRecall": 0.8, "ContextRelevance": 0.75, "Faithfulness": 0.4, "Hallucination": 0.0, @@ -1345,10 +1655,12 @@ def test_text_generation_two_datasets( "Toxicity": 0.0, }, "uid1": { + "AnswerCorrectness": 1.0, "AnswerRelevance": 0.2, "Bias": 0.0, "BLEU": 1.0, - "Coherence": 5, + "ContextPrecision": 1.0, + "ContextRecall": 0.5, "ContextRelevance": 1.0, "Faithfulness": 0.55, "Hallucination": 0.0, @@ -1361,10 +1673,12 @@ def test_text_generation_two_datasets( "Toxicity": 0.0, }, "uid2": { + "AnswerCorrectness": 0.0, "AnswerRelevance": 0.2, "Bias": 0.0, "BLEU": 0.05434912989707719, - "Coherence": 4, + "ContextPrecision": 1.0, + "ContextRecall": 0.2, "ContextRelevance": 0.25, "Faithfulness": 0.6666666666666666, "Hallucination": 0.25, @@ -1379,6 +1693,7 @@ def test_text_generation_two_datasets( } assert metrics + assert len(metrics) == len(metrics_to_return) * len(expected_values) for metric in metrics: assert isinstance(metric.parameters, dict) assert ( @@ -1389,7 +1704,6 @@ def test_text_generation_two_datasets( # test with a content generation dataset metrics_to_return = [ MetricType.Bias, - MetricType.Coherence, MetricType.Toxicity, ] @@ -1434,22 +1748,20 @@ def test_text_generation_two_datasets( expected_values = { "uid0": { "Bias": 0.2, - "Coherence": 5, "Toxicity": 0.4, }, "uid1": { "Bias": 0.0, - "Coherence": 5, "Toxicity": 0.0, }, "uid2": { "Bias": 0.0, - 
"Coherence": 5, "Toxicity": 0.0, }, } assert metrics + assert len(metrics) == len(metrics_to_return) * len(expected_values) for metric in metrics: assert isinstance(metric.parameters, dict) assert ( diff --git a/api/tests/unit-tests/schemas/test_evaluation.py b/api/tests/unit-tests/schemas/test_evaluation.py index d2fe1e73f..a6cd16c6a 100644 --- a/api/tests/unit-tests/schemas/test_evaluation.py +++ b/api/tests/unit-tests/schemas/test_evaluation.py @@ -59,10 +59,12 @@ def test_EvaluationParameters(llm_api_params): schemas.EvaluationParameters( task_type=enums.TaskType.TEXT_GENERATION, metrics_to_return=[ + MetricType.AnswerCorrectness, MetricType.AnswerRelevance, MetricType.Bias, MetricType.BLEU, - MetricType.Coherence, + MetricType.ContextPrecision, + MetricType.ContextRecall, MetricType.ContextRelevance, MetricType.Faithfulness, MetricType.Hallucination, @@ -76,10 +78,12 @@ def test_EvaluationParameters(llm_api_params): schemas.EvaluationParameters( task_type=enums.TaskType.TEXT_GENERATION, metrics_to_return=[ + MetricType.AnswerCorrectness, MetricType.AnswerRelevance, MetricType.Bias, MetricType.BLEU, - MetricType.Coherence, + MetricType.ContextPrecision, + MetricType.ContextRecall, MetricType.ContextRelevance, MetricType.Faithfulness, MetricType.Hallucination, @@ -167,15 +171,13 @@ def test_EvaluationParameters(llm_api_params): ) # If any llm-guided metrics are requested, then llm_api_params must be provided. + # Purposely did a subset of metrics_to_return, to increase test variation. with pytest.raises(ValidationError): schemas.EvaluationParameters( task_type=enums.TaskType.TEXT_GENERATION, metrics_to_return=[ MetricType.AnswerRelevance, - MetricType.Bias, MetricType.BLEU, - MetricType.Coherence, - MetricType.ContextRelevance, MetricType.Faithfulness, MetricType.Hallucination, MetricType.ROUGE, @@ -195,19 +197,15 @@ def test_EvaluationParameters(llm_api_params): bleu_weights=[1.1, 0.3, -0.5, 0.1], ) - # BLEU weights must sum to 1. + # BLEU weights must sum to 1. metrics_to_return here are all metrics applicable to summarization. 
with pytest.raises(ValidationError): schemas.EvaluationParameters( task_type=enums.TaskType.TEXT_GENERATION, metrics_to_return=[ - MetricType.AnswerRelevance, MetricType.Bias, MetricType.BLEU, - MetricType.Coherence, - MetricType.ContextRelevance, - MetricType.Faithfulness, - MetricType.Hallucination, MetricType.ROUGE, + MetricType.SummaryCoherence, MetricType.Toxicity, ], llm_api_params=llm_api_params, diff --git a/api/tests/unit-tests/schemas/test_metrics.py b/api/tests/unit-tests/schemas/test_metrics.py index 803119a8d..1b2b17e67 100644 --- a/api/tests/unit-tests/schemas/test_metrics.py +++ b/api/tests/unit-tests/schemas/test_metrics.py @@ -434,6 +434,50 @@ def test_DetailedPrecisionRecallCurve(): } +def test_AnswerCorrectnessMetric(): + metric = schemas.AnswerCorrectnessMetric( + value=0.52, + parameters={ + "dataset_uid": "01", + "dataset_name": "test_dataset", + "prediction": "some prediction", + }, + ) + + with pytest.raises(ValidationError): + schemas.AnswerCorrectnessMetric( + value=None, # type: ignore + parameters={ + "dataset_uid": "01", + "dataset_name": "test_dataset", + "prediction": "some prediction", + }, + ) + + with pytest.raises(ValidationError): + schemas.AnswerCorrectnessMetric( + value={"key": 0.3}, # type: ignore + parameters={ + "dataset_uid": "01", + "dataset_name": "test_dataset", + "prediction": "some prediction", + }, + ) + + with pytest.raises(ValidationError): + schemas.AnswerCorrectnessMetric( + value=0.0, # type: ignore + parameters="not a valid parameter", # type: ignore + ) + + assert all( + [ + key in ["value", "type", "evaluation_id", "parameters"] + for key in metric.db_mapping(evaluation_id=1) + ] + ) + + def test_AnswerRelevanceMetric(): metric = schemas.AnswerRelevanceMetric( value=0.421, @@ -581,49 +625,83 @@ def test_BLEUMetric(): ) -def test_CoherenceMetric(): - metric = schemas.CoherenceMetric( - value=3, +def test_ContextPrecisionMetric(): + metric = schemas.ContextPrecisionMetric( + value=0.873, parameters={ "dataset_uid": "01", "dataset_name": "test_dataset", - "prediction": "some prediction", + "context_list": ["context1", "context2"], }, ) with pytest.raises(ValidationError): - schemas.CoherenceMetric( + schemas.ContextPrecisionMetric( value=None, # type: ignore parameters={ "dataset_uid": "01", "dataset_name": "test_dataset", - "prediction": "some prediction", + "context_list": ["context1", "context2"], }, ) with pytest.raises(ValidationError): - schemas.CoherenceMetric( - value=2.5, # type: ignore + schemas.ContextPrecisionMetric( + value={"key": 0.222}, # type: ignore parameters={ "dataset_uid": "01", "dataset_name": "test_dataset", - "prediction": "some prediction", + "context_list": ["context1", "context2"], }, ) with pytest.raises(ValidationError): - schemas.CoherenceMetric( - value={"key": 4}, # type: ignore + schemas.ContextPrecisionMetric( + value=0.501, # type: ignore + parameters="not a valid parameter", # type: ignore + ) + + assert all( + [ + key in ["value", "type", "evaluation_id", "parameters"] + for key in metric.db_mapping(evaluation_id=1) + ] + ) + + +def test_ContextRecallMetric(): + metric = schemas.ContextRecallMetric( + value=0.8, + parameters={ + "dataset_uid": "01", + "dataset_name": "test_dataset", + "context_list": ["context1", "context2"], + }, + ) + + with pytest.raises(ValidationError): + schemas.ContextRecallMetric( + value="value", # type: ignore parameters={ "dataset_uid": "01", "dataset_name": "test_dataset", - "prediction": "some prediction", + "context_list": ["context1", "context2"], }, ) with 
pytest.raises(ValidationError): - schemas.CoherenceMetric( - value=5, # type: ignore + schemas.ContextRecallMetric( + value={"key": 0.5}, # type: ignore + parameters={ + "dataset_uid": "01", + "dataset_name": "test_dataset", + "context_list": ["context1", "context2"], + }, + ) + + with pytest.raises(ValidationError): + schemas.ContextRecallMetric( + value=0.6, # type: ignore parameters="not a valid parameter", # type: ignore ) @@ -838,6 +916,60 @@ def test_ROUGEMetric(): ) +def test_SummaryCoherenceMetric(): + metric = schemas.SummaryCoherenceMetric( + value=3, + parameters={ + "dataset_uid": "01", + "dataset_name": "test_dataset", + "prediction": "some summary", + }, + ) + + with pytest.raises(ValidationError): + schemas.SummaryCoherenceMetric( + value=None, # type: ignore + parameters={ + "dataset_uid": "01", + "dataset_name": "test_dataset", + "prediction": "some summary", + }, + ) + + with pytest.raises(ValidationError): + schemas.SummaryCoherenceMetric( + value=2.5, # type: ignore + parameters={ + "dataset_uid": "01", + "dataset_name": "test_dataset", + "prediction": "some summary", + }, + ) + + with pytest.raises(ValidationError): + schemas.SummaryCoherenceMetric( + value={"key": 4}, # type: ignore + parameters={ + "dataset_uid": "01", + "dataset_name": "test_dataset", + "prediction": "some summary", + }, + ) + + with pytest.raises(ValidationError): + schemas.SummaryCoherenceMetric( + value=5, # type: ignore + parameters="not a valid parameter", # type: ignore + ) + + assert all( + [ + key in ["value", "type", "evaluation_id", "parameters"] + for key in metric.db_mapping(evaluation_id=1) + ] + ) + + def test_ToxicityMetric(): metric = schemas.ToxicityMetric( value=0.4, diff --git a/api/valor_api/backend/core/llm_clients.py b/api/valor_api/backend/core/llm_clients.py index 3ffc61ec5..e288c08c7 100644 --- a/api/valor_api/backend/core/llm_clients.py +++ b/api/valor_api/backend/core/llm_clients.py @@ -5,15 +5,18 @@ from pydantic import BaseModel from valor_api.backend.core.llm_instructions_analysis import ( + generate_answer_correctness_verdicts_instruction, generate_answer_relevance_verdicts_instruction, generate_bias_verdicts_instruction, generate_claims_instruction, - generate_coherence_instruction, + generate_context_precision_verdicts_instruction, + generate_context_recall_verdicts_instruction, generate_context_relevance_verdicts_instruction, generate_faithfulness_verdicts_instruction, generate_hallucination_verdicts_instruction, generate_opinions_instruction, generate_statements_instruction, + generate_summary_coherence_instruction, generate_toxicity_verdicts_instruction, ) from valor_api.backend.metrics.metric_utils import trim_and_load_json @@ -128,7 +131,7 @@ def _generate_claims( {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, { "role": "user", - "content": generate_claims_instruction(text), + "content": generate_claims_instruction(text=text), }, ] @@ -168,7 +171,7 @@ def _generate_opinions( {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, { "role": "user", - "content": generate_opinions_instruction(text), + "content": generate_opinions_instruction(text=text), }, ] @@ -208,7 +211,7 @@ def _generate_statements( {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, { "role": "user", - "content": generate_statements_instruction(text), + "content": generate_statements_instruction(text=text), }, ] @@ -227,6 +230,75 @@ def _generate_statements( ) return statements + def _generate_answer_correctness_verdicts( + self, + query: str, + prediction_statements: list[str], + 
groundtruth_statements: list[str], + ) -> dict[str, list[dict[str, str]]]: + """ + Generate lists of true positives, false positives and false negatives, using a call to the LLM API. + + Parameters + ---------- + query: str + The query that both the prediction and ground truth should be answering. + prediction_statements: list[str] + The prediction statements to evaluate. + groundtruth_statements: list[str] + The ground truth statements to evaluate. + + Returns + ------- + dict[str, list[dict[str, str]]] + A dictionary of true positives, false positives and false negatives. + """ + messages = [ + {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, + { + "role": "user", + "content": generate_answer_correctness_verdicts_instruction( + query=query, + prediction_statements=prediction_statements, + groundtruth_statements=groundtruth_statements, + ), + }, + ] + response = self(messages) + response = trim_and_load_json(response) + if ( + type(response) != dict + or "TP" not in response + or "FP" not in response + or "FN" not in response + ): + raise InvalidLLMResponseError( + f"LLM response was not a dictionary of true positives, false positives and false negatives: {response}" + ) + + if ( + type(response["TP"]) != list + or type(response["FP"]) != list + or type(response["FN"]) != list + ): + raise InvalidLLMResponseError( + f"LLM response did not contain valid lists of true positives, false positives and false negatives: {response}" + ) + + if len(response["TP"]) + len(response["FP"]) != len( + prediction_statements + ): + raise InvalidLLMResponseError( + f"Number of true positives and false positives did not match the number of prediction statements: {response}" + ) + + if len(response["FN"]) > len(groundtruth_statements): + raise InvalidLLMResponseError( + f"Number of false negatives exceeded the number of ground truth statements: {response}" + ) + + return response + def _generate_answer_relevance_verdicts( self, query: str, @@ -245,15 +317,15 @@ def _generate_answer_relevance_verdicts( Returns ------- list[dict[str,str]] - The list of verdicts for each statement. Each verdict is a dictionary with the "verdict" and optionally a "reason". + The list of verdicts for each statement. Each verdict is a dictionary with the "verdict" field. """ messages = [ {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, { "role": "user", "content": generate_answer_relevance_verdicts_instruction( - query, - statements, + query=query, + statements=statements, ), }, ] @@ -295,14 +367,14 @@ def _generate_bias_verdicts( Returns ------- list[dict[str,str]] - The list of verdicts for each opinion. Each verdict is a dictionary with the "verdict" and optionally a "reason". + The list of verdicts for each opinion. Each verdict is a dictionary with the "verdict" field. """ messages = [ {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, { "role": "user", "content": generate_bias_verdicts_instruction( - opinions, + opinions=opinions, ), }, ] @@ -328,44 +400,117 @@ def _generate_bias_verdicts( return verdicts - def _coherence( + def _generate_context_precision_verdicts( self, - text: str, - ) -> int: + query: str, + ordered_context_list: list[str], + groundtruth: str, + ) -> list[dict[str, str]]: """ - Compute coherence, the collective quality of all sentences, for a single piece of text. + Generate a list of context precision verdicts for an ordered list of contexts, using a call to the LLM API. 
+ + The verdict for each context should be 'yes' if the context is relevant to produce the ground truth answer to the query. The verdict should be 'no' otherwise. Parameters ---------- - text: str - The text to be evaluated. + query: str + The query. + ordered_context_list: list[str] + The ordered list of contexts. Each context will be evaluated to determine if it is useful for producing the ground truth answer to the query. + groundtruth: str + The ground truth answer to the query. Returns ------- - int - The coherence score will be evaluated as an integer, with 1 indicating the lowest coherence and 5 the highest coherence. + list[dict[str,str]] + The list of verdicts for each context. Each verdict is a dictionary with the "verdict" field. """ messages = [ {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, - {"role": "user", "content": generate_coherence_instruction(text)}, + { + "role": "user", + "content": generate_context_precision_verdicts_instruction( + query=query, + ordered_context_list=ordered_context_list, + groundtruth=groundtruth, + ), + }, ] response = self(messages) + response = trim_and_load_json(response) + if type(response) != dict or "verdicts" not in response: + raise InvalidLLMResponseError( + f"LLM response was not a list of valid verdicts: {response}" + ) - try: - # Valid responses: "5", "\n5", "5\n", "5.", " 5", "5 {explanation}", etc. - ret = int(response.strip()[0]) - except Exception: + verdicts = response["verdicts"] + if ( + type(verdicts) != list + or len(verdicts) != len(ordered_context_list) + or not all( + verdict["verdict"] in ["yes", "no"] for verdict in verdicts + ) + ): raise InvalidLLMResponseError( - f"LLM response was not a valid coherence score: {response}" + f"LLM response was not a list of valid verdicts: {response}" ) - if ret not in {1, 2, 3, 4, 5}: + return verdicts + + def _generate_context_recall_verdicts( + self, + context_list: list[str], + groundtruth_statements: list[str], + ) -> list[dict[str, str]]: + """ + Generate a list of context recall verdicts for a list of ground truth statements, using a call to the LLM API. + + The verdict for each ground truth statement should be 'yes' if the ground truth statement is attributable to the context list and 'no' otherwise. + + Parameters + ---------- + context_list: list[str] + The list of contexts to evaluate against. + groundtruth_statements: str + A list of statements extracted from the ground truth answer. + + Returns + ------- + list[dict[str,str]] + The list of verdicts for each ground truth statement. Each verdict is a dictionary with the "verdict" field. 
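Both verdict generators above apply the same shape checks to the parsed LLM response. A hypothetical standalone helper illustrating those checks is sketched below; the `InvalidLLMResponseError` class is a stand-in defined only for this sketch, not the module's own exception.

```python
class InvalidLLMResponseError(Exception):
    """Stand-in for the module's exception type, defined here only for illustration."""


def validate_verdicts(response, expected_length: int) -> list[dict[str, str]]:
    # The parsed response must be a dict with a 'verdicts' list of the expected
    # length, and every verdict must be strictly "yes" or "no".
    if not isinstance(response, dict) or "verdicts" not in response:
        raise InvalidLLMResponseError(
            f"LLM response was not a list of valid verdicts: {response}"
        )
    verdicts = response["verdicts"]
    if (
        not isinstance(verdicts, list)
        or len(verdicts) != expected_length
        or not all(
            isinstance(v, dict) and v.get("verdict") in ("yes", "no")
            for v in verdicts
        )
    ):
        raise InvalidLLMResponseError(
            f"LLM response was not a list of valid verdicts: {response}"
        )
    return verdicts


# Example: a well-formed response with one verdict per expected item passes.
validate_verdicts({"verdicts": [{"verdict": "yes"}, {"verdict": "no"}]}, expected_length=2)
```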
+ """ + messages = [ + {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, + { + "role": "user", + "content": generate_context_recall_verdicts_instruction( + context_list=context_list, + groundtruth_statements=groundtruth_statements, + ), + }, + ] + + response = self(messages) + response = trim_and_load_json(response) + if type(response) != dict or "verdicts" not in response: raise InvalidLLMResponseError( - f"Coherence score was not an integer between 1 and 5: {ret}" + f"LLM response was not a list of valid verdicts: {response}" ) - return ret + verdicts = response["verdicts"] + if ( + type(verdicts) != list + or len(verdicts) != len(groundtruth_statements) + or not all( + verdict["verdict"] in ["yes", "no"] for verdict in verdicts + ) + ): + raise InvalidLLMResponseError( + f"LLM response was not a list of valid verdicts: {response}" + ) + + return verdicts def _generate_context_relevance_verdicts( self, @@ -385,15 +530,15 @@ def _generate_context_relevance_verdicts( Returns ------- list[dict[str,str]] - The list of verdicts for each context. Each verdict is a dictionary with the "verdict" and optionally a "reason". + The list of verdicts for each context. Each verdict is a dictionary with the "verdict" field. """ messages = [ {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, { "role": "user", "content": generate_context_relevance_verdicts_instruction( - query, - context_list, + query=query, + context_list=context_list, ), }, ] @@ -444,8 +589,8 @@ def _generate_faithfulness_verdicts( { "role": "user", "content": generate_faithfulness_verdicts_instruction( - claims, - context_list, + claims=claims, + context_list=context_list, ), }, ] @@ -491,15 +636,15 @@ def _generate_hallucination_verdicts( Returns ------- list[dict[str,str]] - The list of verdicts for each context. Each verdict is a dictionary with the "verdict" and optionally a "reason". + The list of verdicts for each context. Each verdict is a dictionary with the "verdict" field. """ messages = [ {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, { "role": "user", "content": generate_hallucination_verdicts_instruction( - text, - context_list, + text=text, + context_list=context_list, ), }, ] @@ -525,6 +670,53 @@ def _generate_hallucination_verdicts( return verdicts + def _summary_coherence( + self, + text: str, + summary: str, + ) -> int: + """ + Compute summary coherence, the collective quality of a summary. + + Parameters + ---------- + text: str + The text that was summarized. + summary: str + The summary to be evaluated. + + Returns + ------- + int + The summary coherence score will be evaluated as an integer, with 1 indicating the lowest summary coherence and 5 the highest summary coherence. + """ + messages = [ + {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, + { + "role": "user", + "content": generate_summary_coherence_instruction( + text=text, summary=summary + ), + }, + ] + + response = self(messages) + + try: + # Valid responses: "5", "\n5", "5\n", "5.", " 5", "5 {explanation}", etc. + ret = int(response.strip()[0]) + except Exception: + raise InvalidLLMResponseError( + f"LLM response was not a valid summary coherence score: {response}" + ) + + if ret not in {1, 2, 3, 4, 5}: + raise InvalidLLMResponseError( + f"Summary coherence score was not an integer between 1 and 5: {ret}" + ) + + return ret + def _generate_toxicity_verdicts( self, opinions: list[str], @@ -540,14 +732,14 @@ def _generate_toxicity_verdicts( Returns ------- list[dict[str,str]] - The list of verdicts for each opinion. 
Each verdict is a dictionary with the "verdict" and optionally a "reason". + The list of verdicts for each opinion. Each verdict is a dictionary with the "verdict" field. """ messages = [ {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}, { "role": "user", "content": generate_toxicity_verdicts_instruction( - opinions, + opinions=opinions, ), }, ] @@ -573,6 +765,56 @@ def _generate_toxicity_verdicts( return verdicts + def answer_correctness( + self, + query: str, + prediction: str, + groundtruth_list: list[str], + ) -> float: + """ + Compute answer correctness. Answer correctness is computed as an f1 score obtained by comparing prediction statements to ground truth statements. + + If there are multiple ground truths, then the f1 score is computed for each ground truth and the maximum score is returned. + + This metric was adapted from RAGAS. We follow a similar prompting strategy and computation, however we do not do a weighted sum with an answer similarity score using embeddings. + + Parameters + ---------- + query: str + The query that both the ground truth and prediction should be answering. + prediction: str + The prediction text to extract statements from. + groundtruth_list: list[str] + A list of ground truth texts to extract statements from. + + Returns + ------- + float + The answer correctness score between 0 and 1. Higher values indicate that the answer is more correct. A score of 1 indicates that all statements in the prediction are supported by the ground truth and all statements in the ground truth are present in the prediction. + """ + if len(groundtruth_list) == 0: + raise ValueError( + "Answer correctness is meaningless if the ground truth list is empty." + ) + + prediction_statements = self._generate_statements(prediction) + f1_scores = [] + for groundtruth in groundtruth_list: + groundtruth_statements = self._generate_statements(groundtruth) + verdicts = self._generate_answer_correctness_verdicts( + query=query, + groundtruth_statements=groundtruth_statements, + prediction_statements=prediction_statements, + ) + + tp = len(verdicts["TP"]) + fp = len(verdicts["FP"]) + fn = len(verdicts["FN"]) + + f1_scores.append(tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0) + + return max(f1_scores) + def answer_relevance( self, query: str, @@ -591,7 +833,7 @@ def answer_relevance( Returns ------- float - The answer relevance score will be evaluated as a float between 0 and 1, with 1 indicating that all statements are relevant to the query. + The answer relevance score between 0 and 1. A score of 1 indicates that all statements are relevant to the query. """ statements = self._generate_statements(text) verdicts = self._generate_answer_relevance_verdicts(query, statements) @@ -614,7 +856,7 @@ def bias( Returns ------- float - The bias score will be evaluated as a float between 0 and 1, with 1 indicating that all opinions in the text are biased. + The bias score between 0 and 1. A score of 1 indicates that all opinions in the text are biased. """ opinions = self._generate_opinions(text) if len(opinions) == 0: @@ -626,24 +868,128 @@ def bias( 1 for verdict in verdicts if verdict["verdict"] == "yes" ) / len(verdicts) - def coherence( + def context_precision( self, - text: str, - ) -> int: + query: str, + ordered_context_list: list[str], + groundtruth_list: list[str], + ) -> float: """ - Compute coherence, the collective quality of all sentences, for a single piece of text. + Compute context precision, a score for evaluating the retrieval mechanism of a RAG model. 
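Before moving on, the `answer_correctness` aggregation above can be made concrete with a minimal sketch of the F1 computation over already-classified verdicts; the verdict dictionaries below are hypothetical stand-ins for the output of `_generate_answer_correctness_verdicts`.

```python
# Minimal sketch of the answer correctness aggregation, assuming the TP/FP/FN
# verdicts have already been produced for each ground truth.
def answer_correctness_from_verdicts(
    verdicts_per_groundtruth: list[dict[str, list[str]]],
) -> float:
    f1_scores = []
    for verdicts in verdicts_per_groundtruth:
        tp = len(verdicts["TP"])
        fp = len(verdicts["FP"])
        fn = len(verdicts["FN"])
        # F1 written as tp / (tp + 0.5 * (fp + fn)); 0 if there are no true positives.
        f1_scores.append(tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0.0)
    # With multiple ground truths, the best-matching one determines the score.
    return max(f1_scores)


# Example: 3 TP, 1 FP, 2 FN -> 3 / (3 + 0.5 * 3) = 2/3.
example = [{"TP": ["a", "b", "c"], "FP": ["d"], "FN": ["x", "y"]}]
assert abs(answer_correctness_from_verdicts(example) - 2 / 3) < 1e-9
```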
+ + First, an LLM is prompted to determine if each context in the context list is useful for producing the ground truth answer to the query. + + If there are multiple ground truths, then the verdict is "yes" for a context if that context is useful for producing any of the ground truth answers, and "no" otherwise. + + Then, using these verdicts, the context precision score is computed as a weighted sum of the precision at k for each k from 1 to the length of the context list. + + Note that the earlier a piece of context appears in the context list, the more important it is in the computation of this score. For example, the first context in the context list will be included in every precision at k computation, so will have a large influence on the final score, whereas the last context will only be used for the last precision at k computation, so will have a small influence on the final score. Parameters ---------- - text: str - The text to be evaluated. + query: str + A query. + ordered_context_list: list[str] + The ordered list of contexts. Each context will be evaluated to determine if it is useful for producing the ground truth answer to the query. Contexts in this list are NOT treated equally in the computation of this score. The earlier a piece of context appears in the context list, the more important it is in the computation of this score. + groundtruth_list: list[str] + A list of ground truth answers to the query. Returns ------- - int - The coherence score will be evaluated as an integer, with 1 indicating the lowest coherence and 5 the highest coherence. + float + The context precision score between 0 and 1. A higher score indicates better context precision. + """ + if len(ordered_context_list) == 0: + raise ValueError( + "Context precision is meaningless if the context list is empty." + ) + if len(groundtruth_list) == 0: + raise ValueError( + "Context precision is meaningless if the ground truth list is empty." + ) + + # Get verdicts for each ground truth, and aggregate by setting the verdict for + # a context to "yes" if the verdict is "yes" for any ground truth. + aggregate_verdicts = ["no"] * len(ordered_context_list) + for groundtruth in groundtruth_list: + verdicts = self._generate_context_precision_verdicts( + query=query, + ordered_context_list=ordered_context_list, + groundtruth=groundtruth, + ) + for i in range(len(verdicts)): + if verdicts[i]["verdict"] == "yes": + aggregate_verdicts[i] = "yes" + + # Use the aggregate verdicts to compute the precision at k for each k. + precision_at_k_list = [] + for k in range(1, len(ordered_context_list) + 1): + # Only compute the precision at k if the kth context is relevant. + if aggregate_verdicts[k - 1] == "yes": + precision_at_k = ( + sum( + 1 + for verdict in aggregate_verdicts[:k] + if verdict == "yes" + ) + / k + ) + precision_at_k_list.append(precision_at_k) + + # If none of the context are relevant, then the context precision is 0. + if len(precision_at_k_list) == 0: + return 0 + + # Average over all the precision at k for which the kth context is relevant. + return sum(precision_at_k_list) / len(precision_at_k_list) + + def context_recall( + self, + context_list: list[str], + groundtruth_list: list[str], + ) -> float: """ - return self._coherence(text) + Compute context recall, a score for evaluating the retrieval mechanism of a RAG model. + + The context recall score is the proportion of statements in the ground truth that are attributable to the context list. 
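Returning to `context_precision` above, the weighted precision-at-k step can be illustrated in isolation; the aggregated verdict list below is a made-up example.

```python
# Minimal sketch of context precision from aggregated "yes"/"no" verdicts,
# one verdict per context in retrieval order.
def context_precision_from_verdicts(aggregate_verdicts: list[str]) -> float:
    precision_at_k_list = []
    for k in range(1, len(aggregate_verdicts) + 1):
        # Precision@k only contributes when the k-th context is itself relevant.
        if aggregate_verdicts[k - 1] == "yes":
            relevant_in_top_k = sum(1 for v in aggregate_verdicts[:k] if v == "yes")
            precision_at_k_list.append(relevant_in_top_k / k)
    # If no context is relevant, the score is 0.
    if not precision_at_k_list:
        return 0.0
    return sum(precision_at_k_list) / len(precision_at_k_list)


# ["yes", "no", "yes"] -> mean(P@1, P@3) = mean(1/1, 2/3) = 5/6.
assert abs(context_precision_from_verdicts(["yes", "no", "yes"]) - 5 / 6) < 1e-9
```

Note how the example shows the position weighting: the relevant context in position 1 contributes a full 1.0 to the average, while the one in position 3 contributes only 2/3.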
+ + If multiple ground truths are provided, then the context recall score is computed for each ground truth and the maximum score is returned. + + Parameters + ---------- + context_list: list[str] + The list of contexts to evaluate against. + groundtruth_list: str + A list of ground truth answers to extract statements from. + + Returns + ------- + float + The context recall score between 0 and 1. A score of 1 indicates that all ground truth statements are attributable to the contexts in the context list. + """ + if len(context_list) == 0: + raise ValueError( + "Context recall is meaningless if the context list is empty." + ) + if len(groundtruth_list) == 0: + raise ValueError( + "Context recall is meaningless if the ground truth list is empty." + ) + + scores = [] + for groundtruth in groundtruth_list: + groundtruth_statements = self._generate_statements(groundtruth) + + verdicts = self._generate_context_recall_verdicts( + context_list, groundtruth_statements + ) + + scores.append( + sum(1 for verdict in verdicts if verdict["verdict"] == "yes") + / len(verdicts) + ) + + return max(scores) def context_relevance( self, @@ -663,11 +1009,11 @@ def context_relevance( Returns ------- float - The context relevance score will be evaluated as a float between 0 and 1, with 0 indicating that none of the contexts are relevant and 1 indicating that all of the contexts are relevant. + The context relevance score between 0 and 1. A score of 0 indicates that none of the contexts are relevant and a score of 1 indicates that all of the contexts are relevant. """ if len(context_list) == 0: raise ValueError( - "Context relevance is meaningless if context_list is empty." + "Context relevance is meaningless if the context list is empty." ) verdicts = self._generate_context_relevance_verdicts( @@ -696,11 +1042,11 @@ def faithfulness( Returns ------- float - The faithfulness score will be evaluated as a float between 0 and 1, with 1 indicating that all claims in the text are implied by the list of contexts. + The faithfulness score between 0 and 1. A score of 1 indicates that all claims in the text are implied by the list of contexts. """ if len(context_list) == 0: raise ValueError( - "Faithfulness is meaningless if context_list is empty." + "Faithfulness is meaningless if the context list is empty." ) claims = self._generate_claims(text) @@ -737,11 +1083,11 @@ def hallucination( Returns ------- float - The hallucination score will be evaluated as a float between 0 and 1, with 1 indicating that all contexts are contradicted by the text. + The hallucination score between 0 and 1. A score of 1 indicates that all contexts are contradicted by the text. """ if len(context_list) == 0: raise ValueError( - "Hallucination is meaningless if context_list is empty." + "Hallucination is meaningless if the context list is empty." ) verdicts = self._generate_hallucination_verdicts( @@ -753,6 +1099,28 @@ def hallucination( 1 for verdict in verdicts if verdict["verdict"] == "yes" ) / len(verdicts) + def summary_coherence( + self, + text: str, + summary: str, + ) -> int: + """ + Compute summary coherence, the collective quality of a summary. + + Parameters + ---------- + text: str + The text that was summarized. + summary: str + The summary to be evaluated. + + Returns + ------- + int + The summary coherence score between 1 and 5. A score of 1 indicates the lowest summary coherence and a score of 5 indicates the highest summary coherence. 
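The `context_recall` aggregation above similarly reduces to a proportion per ground truth followed by a max over ground truths; a minimal sketch with illustrative inputs:

```python
# Minimal sketch of context recall: the proportion of ground truth statements
# attributable to the context list, maximized over ground truths.
def context_recall_from_verdicts(
    verdicts_per_groundtruth: list[list[dict[str, str]]],
) -> float:
    scores = []
    for verdicts in verdicts_per_groundtruth:
        attributable = sum(1 for v in verdicts if v["verdict"] == "yes")
        scores.append(attributable / len(verdicts))
    return max(scores)


# One ground truth with 4 statements, 3 of which are attributable -> 0.75.
example = [
    [{"verdict": "yes"}, {"verdict": "yes"}, {"verdict": "no"}, {"verdict": "yes"}]
]
assert context_recall_from_verdicts(example) == 0.75
```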
+ """ + return self._summary_coherence(text=text, summary=summary) + def toxicity( self, text: str, @@ -1083,7 +1451,7 @@ def __call__( "claims": [ "The capital of the UK is London.", "The capital of South Korea is Seoul.", - "The capital of the Argentina is Canada." + "The capital of Argentina is Canada." ] }```""" @@ -1113,6 +1481,24 @@ def __call__( ] }```""" + # Answer correctness verdicts + elif ( + "Return in JSON format with three keys: 'TP', 'FP', and 'FN'" + in processed_messages[1]["content"] + ): + response = """```json +{ + "TP": [ + "London is the largest city in the UK by GDP" + ], + "FP": [ + "London is the largest city in the UK by population" + ], + "FN": [ + "In 2021, financial services made up more than 20% of London's output" + ] +}```""" + # Answer relevance verdicts elif ( "generate a list of verdicts that indicate whether each statement is relevant to address the query" @@ -1121,13 +1507,8 @@ def __call__( response = """```json { "verdicts": [ - { - "verdict": "yes" - }, - { - "verdict": "no", - "reason": "The detail in this statement is not necessary for answering the question." - } + {"verdict": "yes"}, + {"verdict": "no"} ] }```""" @@ -1139,23 +1520,46 @@ def __call__( response = """```json { "verdicts": [ - { - "verdict": "no" - }, - { - "verdict": "yes", - "reason": "The opinion 'People from Canada are nicer than people from other countries' shows geographical bias by generalizing positive traits to a specific group of people. A correction would be, 'Many individuals from Canada are known for their politeness.'" - } + {"verdict": "no"}, + {"verdict": "yes"} ] }```""" - # Coherence score + # Summary coherence score elif ( - "Coherence (1-5) - the collective quality of all sentences." + "Your task is to rate the summary based on its coherence" in processed_messages[1]["content"] ): response = "4" + # Context precision verdicts + elif ( + "generate a list of verdicts to determine whether each context in the context list is useful for producing the ground truth answer to the query" + in processed_messages[1]["content"] + ): + response = """```json + { + "verdicts": [ + {"verdict": "yes"}, + {"verdict": "no"}, + {"verdict": "no"}, + {"verdict": "yes"} + ] + }```""" + + # Context recall verdicts + elif ( + "analyze each ground truth statement and determine if the statement can be attributed to the given context" + in processed_messages[1]["content"] + ): + response = """```json + { + "verdicts": [ + {"verdict": "yes"}, + {"verdict": "yes"} + ] + }```""" + # Context relevance verdicts elif ( "generate a list of verdicts to indicate whether each context is relevant to the provided query" @@ -1164,19 +1568,10 @@ def __call__( response = """```json { "verdicts": [ - { - "verdict": "yes" - }, - { - "verdict": "yes" - }, - { - "verdict": "no", - "reason": "This context has no relevance to the query" - }, - { - "verdict": "yes" - } + {"verdict": "yes"}, + {"verdict": "yes"}, + {"verdict": "no"}, + {"verdict": "yes"} ] }```""" @@ -1202,19 +1597,10 @@ def __call__( response = """```json { "verdicts": [ - { - "verdict": "no" - }, - { - "verdict": "no" - }, - { - "verdict": "yes", - "reason": "The text contradicts this context." 
- }, - { - "verdict": "no" - } + {"verdict": "no"}, + {"verdict": "no"}, + {"verdict": "yes"}, + {"verdict": "no"} ] }```""" @@ -1226,12 +1612,8 @@ def __call__( response = """```json { "verdicts": [ - { - "verdict": "no" - }, - { - "verdict": "no" - } + {"verdict": "no"}, + {"verdict": "no"} ] }```""" diff --git a/api/valor_api/backend/core/llm_instructions_analysis.py b/api/valor_api/backend/core/llm_instructions_analysis.py index c7998d96e..2d6138f13 100644 --- a/api/valor_api/backend/core/llm_instructions_analysis.py +++ b/api/valor_api/backend/core/llm_instructions_analysis.py @@ -14,7 +14,7 @@ def generate_claims_instruction(text: str) -> str: Returns ------- str - The instruction for the llm. + The instruction for the LLM. """ return f"""Based on the text, generate a comprehensive list of FACTUAL CLAIMS that can be inferred from the text. @@ -23,13 +23,13 @@ def generate_claims_instruction(text: str) -> str: You should NOT include any prior knowledge. Take the text at face value when extracting claims. ===== EXAMPLE ====== -Example Text: "Einstein won the noble prize in 1968 for his discovery of the photoelectric effect." +Example Text: "Einstein won the noble prize in 1921 for his discovery of the photoelectric effect." Example JSON: {{ "claims": [ "Einstein won the noble prize for his discovery of the photoelectric effect.", - "Einstein won the noble prize in 1968." + "Einstein won the noble prize in 1921." ] }} ===== END OF EXAMPLE ====== @@ -57,7 +57,7 @@ def generate_opinions_instruction(text: str) -> str: Returns ------- str - The instruction for the llm. + The instruction for the LLM. """ return f"""Based on the text, generate a list of OPINIONS presented in the text. Claims and undisputed truths are NOT opinions. @@ -101,7 +101,7 @@ def generate_statements_instruction(text: str) -> str: Returns ------- str - The instruction for the llm. + The instruction for the LLM. """ return f"""Based on the text, breakdown and generate a list of STATEMENTS presented in the text. Ambiguous statements and single words can also be considered as statements. @@ -127,6 +127,79 @@ def generate_statements_instruction(text: str) -> str: """ +def generate_answer_correctness_verdicts_instruction( + query: str, + prediction_statements: list[str], + groundtruth_statements: list[str], +) -> str: + """ + Instruction template was adapted from RAGAS's codebase https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py. + + The RAGAS instruction and example were modified to fit the format of the other Valor LLM-guided metric instructions. + + Parameters + ---------- + query: str + The query that both the prediction and ground truth should be answering. + prediction_statements: list[str] + The prediction statements to evaluate the validity of. + groundtruth_statements: list[str] + The ground truth statements to evaluate the validity of. + + Returns + ------- + str + The instruction for the LLM. + """ + return f"""Based on the query, the prediction statements and the ground truth statements, analyze each statement and classify them into one of the following categories: +- TP (true positive): statements present in the prediction that are directly supported by one or more statements in the ground truth, +- FP (false positive): statements present in the prediction that are not directly supported by any statement in the ground truth, +- FN (false negative): statements present in the ground truth that aren't represented in any statements in the prediction. 
+ +IMPORTANT: Return in JSON format with three keys: 'TP', 'FP', and 'FN', each mapping to a list of statements. +Each statement can only belong to one of the categories. +All prediction statements should either be in 'TP' or 'FP'. +All ground truth statements should either be in 'FN' or not present in the JSON. A ground truth statement should only be in 'FN' if it does not support any of the prediction statements in 'TP'. + +===== EXAMPLE ====== +Example Query: What is the boiling point of water? + +Example Prediction Statements: [ + "The boiling point of water is 100 degrees Celsius at sea level", + "The melting point of water is 0 degrees Celsius!" +] + +Example Ground Truth Statements: [ + "The boiling point of water is 100 degrees Celsius (212 degrees Fahrenheit) at sea level.", + "The boiling point of water can change with altitude." +] + +Example JSON: +{{ + "TP": [ + "The boiling point of water is 100 degrees Celsius at sea level" + ], + "FP": [ + "The melting point of water is 0 degrees Celsius!" + ], + "FN": [ + "The boiling point of water can change with altitude." + ] +}} +===== END OF EXAMPLE ====== +Query: +{query} + +Prediction Statements: +{prediction_statements} + +Ground Truth Statements: +{groundtruth_statements} + +JSON: +""" + + def generate_answer_relevance_verdicts_instruction( query: str, statements: list[str] ) -> str: @@ -147,7 +220,7 @@ def generate_answer_relevance_verdicts_instruction( Returns ------- str - The instruction for the llm. + The instruction for the LLM. """ return f"""Based on the query and the list of statements, generate a list of verdicts that indicate whether each statement is relevant to address the query. Each verdict should have two mandatory fields: 'analysis' and 'verdict'. @@ -211,7 +284,7 @@ def generate_bias_verdicts_instruction(opinions: list[str]) -> str: Returns ------- str - The instruction for the llm. + The instruction for the LLM. """ return f"""Based on the list of opinions, generate a list of verdicts to indicate whether EACH opinion is biased. Each verdict should have two mandatory fields: 'analysis' and 'verdict'. @@ -270,39 +343,146 @@ def generate_bias_verdicts_instruction(opinions: list[str]) -> str: """ -def generate_coherence_instruction(text: str) -> str: +def generate_context_precision_verdicts_instruction( + query: str, + ordered_context_list: list[str], + groundtruth: str, +) -> str: """ - Generate LLM instruction for evaluating the coherence of the text. + Generate LLM instruction for evaluating the usefulness of contexts for producing the ground truth answer to the query. - This instruction was adapted from appendix A of DeepEval's paper G-EVAL: NLG Evaluation using GPT-4 with Better Human Alignment (https://arxiv.org/pdf/2303.16634). - The main adaptation is a generalization of the metric to more task types. The example prompt in DeepEval was specific to summarization, but the below prompt could apply to any text generation task. - Crucially, unlike DeepEval, no context is used. Instead, the coherence of the text is evaluated entirely based on the text. This generalizes the prompt and also prevents the evaluation from being influenced by the quality of sentences in the context. + Instruction template was adapted from DeepEval's codebase https://github.com/confident-ai/deepeval/blob/main/deepeval/metrics/context_precision/template.py. + + Modifications to the instruction include improvements to the spelling, grammar, formatting and examples. Parameters ---------- - text: str - The text to be evaluated. 
+ query: str + The query. + ordered_context_list: list[str] + The ordered list of contexts. Each context will be evaluated to determine if it is useful for producing the ground truth answer to the query. + groundtruth: str + The ground truth answer to the query. Returns ------- str - The instruction for the llm. + The instruction for the LLM. """ - return f"""Grade the text. Your task is to rate the text based on its coherence. Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed. + return f"""Given the query, context list, and ground truth, generate a list of verdicts to determine whether each context in the context list is useful for producing the ground truth answer to the query. - Evaluation Criteria: - Coherence (1-5) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence whereby ”the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to sentence to a coherent body of information about a topic.” +IMPORTANT: Return in JSON format with the 'verdicts' key mapping to a list of verdicts. +Since you will generate a verdict for each context, the number of verdicts SHOULD BE STRICTLY EQUAL to the length of the context list. +The 'analysis' key should provide a brief analysis of the usefulness of each context for producing the ground truth answer to the query. +The 'analysis' should come BEFORE the 'verdict'. Use your 'analysis' to help you decide the 'verdict'. +The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether each context is useful for producing the ground truth answer to the query. - Evaluation Steps: - 1. Read the text carefully and identify the main topic and key points. - 2. Check if the text presents the information in a clear and logical order. Examine the collective quality of all sentences. - 3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria. Respond with just the number 1 to 5. +===== EXAMPLE ====== +Example Query: "Who won the Nobel Prize in 1921 and for what?" - Text: - {text} +Example Context List: ["Einstein won the Nobel Prize for his discovery of the photoelectric effect", "Einstein won the Nobel Prize in 1921.", "Einstein was born in 1879 in Germany."] - Coherence Score (1-5): +Example Ground Truth: "Einstein won the Nobel Prize in 1921 for his discovery of the photoelectric effect." + +Example JSON: +{{ + "verdicts": [ + {{ + "analysis": "The reason why Einstein won the Nobel Prize answers the second part of the query.", + "verdict": "yes" + }}, + {{ + "reason": "The context answers who won the prize in 1921.", + "verdict": "yes" + }}, + {{ + "reason": "Einstein's birth year is not mentioned in the ground truth answer, so this context is not useful for producing the ground truth.", + "verdict": "no" + }} + ] +}} +===== END OF EXAMPLE ====== + +Query: +{query} + +Context List: +{ordered_context_list} + +Ground Truth: +{groundtruth} + +JSON: +""" + + +def generate_context_recall_verdicts_instruction( + context_list: list[str], + groundtruth_statements: list[str], +) -> str: """ + Generate LLM instruction for evaluating whether each ground truth statement is attributable to the context. 
+ + Instruction template was adapted from RAGAS's codebase https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_context_recall.py. + + Modifications to the instruction include changes to the format to match the other Valor instructions as well as changing the ground truth into a list of ground truth statements. + + Parameters + ---------- + context_list: list[str] + The list of contexts to evaluate against. + groundtruth_statements: str + A list of statements extracted from the ground truth answer. + + Returns + ------- + str + The instruction for the LLM. + """ + return f"""Given a context list and a list of ground truth statements, analyze each ground truth statement and determine if the statement can be attributed to the given context. + +IMPORTANT: Return in JSON format with the 'verdicts' key mapping to a list of verdicts. +Since you will generate a verdict for each ground truth statement, the number of verdicts SHOULD BE STRICTLY EQUAL to the number of ground truth statements. +The 'analysis' key should provide a brief analysis of the relationship of each ground truth statement to the context list. +The 'analysis' should come BEFORE the 'verdict'. Use your 'analysis' to help you decide the 'verdict'. +The 'verdict' key should STRICTLY be either 'yes' or 'no', and states whether each ground truth statement is attributable to the context list. + +===== EXAMPLE ====== +Example Context List: ["Albert Einstein (14 March 1879 - 18 April 1955) was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century.", "Albert Einstein's mass-energy equivalence formula E = mc2, which arises from relativity theory, has been called 'the world's most famous equation'.", "Albert Einstein received the 1921 Nobel Prize in Physics 'for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect', a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. 
His intellectual achievements and originality have made Einstein synonymous with genius."] + +Example Ground Truth Statements: ["Albert Einstein was born on 14 March 1879.", "Albert Einstein received the 1921 Nobel Prize in Physics for his services to theoretical physics.", "Einstein published 4 papers in 1905.", "Einstein moved to Switzerland in 1895."] + +Example JSON: +{{ + "verdicts": [ + {{ + "analysis": "The date of birth of Einstein is mentioned clearly in the context.", + "verdict": "yes" + }}, + {{ + "reason": "The statement matches exactly with part of a sentence present in the given context.", + "verdict": "yes" + }}, + {{ + "reason": "There is no mention about papers he wrote in the given context.", + "verdict": "no" + }}, + {{ + "reason": "There is no supporting evidence for a move to Switzerland in the given context.", + "verdict": "no" + }} + ] +}} +===== END OF EXAMPLE ====== + +Context List: +{context_list} + +Ground Truth Statements: +{groundtruth_statements} + +JSON: +""" def generate_context_relevance_verdicts_instruction( @@ -326,7 +506,7 @@ def generate_context_relevance_verdicts_instruction( Returns ------- str - The instruction for the llm. + The instruction for the LLM. """ return f"""Based on the query and the context list, generate a list of verdicts to indicate whether each context is relevant to the provided query. Each verdict should have two mandatory fields: 'analysis' and 'verdict'. @@ -387,7 +567,7 @@ def generate_faithfulness_verdicts_instruction( Returns ------- str - The instruction for the llm. + The instruction for the LLM. """ return f"""Based on the context list and the list of claims, generate a list of verdicts to indicate whether EACH claim is implied by the context list. Each verdict should have two mandatory fields: 'analysis' and 'verdict'. @@ -464,7 +644,7 @@ def generate_hallucination_verdicts_instruction( Returns ------- str - The instruction for the llm. + The instruction for the LLM. """ return f"""Based on the context list and the text, generate a list of verdicts to indicate whether the given text contradicts EACH context. Each verdict should have two mandatory fields: 'analysis' and 'verdict'. @@ -511,6 +691,47 @@ def generate_hallucination_verdicts_instruction( """ +def generate_summary_coherence_instruction( + text: str, + summary: str, +) -> str: + """ + This instruction was adapted from appendix A of DeepEval's paper G-EVAL: NLG Evaluation using GPT-4 with Better Human Alignment (https://arxiv.org/pdf/2303.16634). + + The instruction was generalized to apply to any text summarization task, as opposed to DeepEval's example instruction which was specific to news article summarization. + + Parameters + ---------- + text: str + The text that was summarized. + summary: str + The summary to be evaluated. + + Returns + ------- + str + The instruction for the llm. + """ + return f"""You will be given one summary written for a piece of text. Your task is to rate the summary based on its coherence. Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing and refer to it as needed. + + Evaluation Criteria: + Coherence (1-5) - the collective quality of all sentences. We align this dimension with the DUC quality question of structure and coherence: the summary should be well-structured and well-organized. The summary should not just be a heap of related information, but should build from sentence to sentence to a coherent body of information about a topic. + + Evaluation Steps: + 1. 
Read the text carefully and identify the main topic and key points. + 2. Read the summary and compare it to the text. Check if the summary covers the main topic and key points of the text, and if it presents them in a clear and logical order. + 3. Assign a score for coherence on a scale of 1 to 5, where 1 is the lowest and 5 is the highest based on the Evaluation Criteria. Respond with just the number 1, 2, 3, 4 or 5. + + Text: + {text} + + Summary: + {summary} + + Coherence Score (1-5): + """ + + def generate_toxicity_verdicts_instruction(opinions: list[str]) -> str: """ Generate LLM instruction for evaluating the toxicity of opinions. @@ -527,7 +748,7 @@ def generate_toxicity_verdicts_instruction(opinions: list[str]) -> str: Returns ------- str - The instruction for the llm. + The instruction for the LLM. """ return f"""Based on the list of opinions, generate a list of verdicts to indicate whether EACH opinion is toxic. Each verdict should have two mandatory fields: 'analysis' and 'verdict'. diff --git a/api/valor_api/backend/metrics/metric_utils.py b/api/valor_api/backend/metrics/metric_utils.py index 11208fc46..c09cd08b4 100644 --- a/api/valor_api/backend/metrics/metric_utils.py +++ b/api/valor_api/backend/metrics/metric_utils.py @@ -110,14 +110,17 @@ def commit_results( | schemas.mIOUMetric | schemas.PrecisionRecallCurve | schemas.DetailedPrecisionRecallCurve + | schemas.AnswerCorrectnessMetric | schemas.AnswerRelevanceMetric | schemas.BiasMetric | schemas.BLEUMetric - | schemas.CoherenceMetric + | schemas.ContextPrecisionMetric + | schemas.ContextRecallMetric | schemas.ContextRelevanceMetric | schemas.FaithfulnessMetric | schemas.HallucinationMetric | schemas.ROUGEMetric + | schemas.SummaryCoherenceMetric | schemas.ToxicityMetric ], evaluation_id: int, diff --git a/api/valor_api/backend/metrics/text_generation.py b/api/valor_api/backend/metrics/text_generation.py index 7ff94f0ff..7e4b14af0 100644 --- a/api/valor_api/backend/metrics/text_generation.py +++ b/api/valor_api/backend/metrics/text_generation.py @@ -28,17 +28,26 @@ LLM_GUIDED_METRICS = { + "AnswerCorrectness", "AnswerRelevance", "Bias", - "Coherence", + "ContextPrecision", + "ContextRecall", "ContextRelevance", "Faithfulness", "Hallucination", + "SummaryCoherence", "Toxicity", } -TEXT_COMPARISON_METRICS = {"BLEU", "ROUGE"} +TEXT_COMPARISON_METRICS = { + "AnswerCorrectness", + "BLEU", + "ContextPrecision", + "ContextRecall", + "ROUGE", +} def _calculate_rouge_scores( @@ -101,7 +110,7 @@ def _calculate_rouge_scores( references=processed_references, rouge_types=rouge_types, use_stemmer=use_stemmer, - use_aggregator=False, # aggregation gives us an average across all predicitons, which isn't what we want + use_aggregator=False, # aggregation gives us an average across all predictions, which isn't what we want ) if not metrics: @@ -127,7 +136,7 @@ def _calculate_sentence_bleu( weights: list[float] = [0.25, 0.25, 0.25, 0.25], ) -> list[dict[str, float]]: """ - Calculate sentence BLEU scores for a set of prediction-groundtruth pairs. + Calculate sentence BLEU scores for a set of prediction - ground truth pairs. 
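For the sentence BLEU calculation referenced above, here is a hedged sketch assuming an NLTK-style `sentence_bleu`; the library choice is an assumption for illustration, and only the uniform 4-gram weight default comes from this diff.

```python
# Illustrative sentence BLEU for one prediction against multiple references,
# assuming NLTK's implementation. The uniform weights mirror the
# [0.25, 0.25, 0.25, 0.25] default shown above.
from nltk.translate.bleu_score import sentence_bleu


def sentence_bleu_example(prediction: str, references: list[str]) -> float:
    weights = (0.25, 0.25, 0.25, 0.25)
    return sentence_bleu(
        [ref.split() for ref in references],  # each reference tokenized
        prediction.split(),  # hypothesis tokenized
        weights=weights,
    )


# An exact match scores ~1.0.
print(sentence_bleu_example("the cat sat on the mat", ["the cat sat on the mat"]))
```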
Parameters ---------- @@ -256,14 +265,17 @@ def _compute_text_generation_metrics( llm_api_params: dict[str, str | dict] | None = None, metric_params: dict = {}, ) -> list[ - schemas.AnswerRelevanceMetric + schemas.AnswerCorrectnessMetric + | schemas.AnswerRelevanceMetric | schemas.BiasMetric | schemas.BLEUMetric - | schemas.CoherenceMetric + | schemas.ContextPrecisionMetric + | schemas.ContextRecallMetric | schemas.ContextRelevanceMetric | schemas.FaithfulnessMetric | schemas.HallucinationMetric | schemas.ROUGEMetric + | schemas.SummaryCoherenceMetric | schemas.ToxicityMetric ]: """ @@ -276,7 +288,7 @@ def _compute_text_generation_metrics( datum_filter : schemas.Filter The filter to be used to query datums. groundtruth_filter : schemas.Filter - The filter to be used to query groundtruths. + The filter to be used to query ground truths. prediction_filter : schemas.Filter The filter to be used to query predictions. metrics_to_return: list[MetricType] @@ -288,9 +300,40 @@ def _compute_text_generation_metrics( Returns ---------- - Sequence[schemas.AnswerRelevanceMetric | schemas.BiasMetric | schemas.BLEUMetric | schemas.CoherenceMetric | schemas.ContextRelevanceMetric | schemas.FaithfulnessMetric | schemas.HallucinationMetric | schemas.ROUGEMetric | schemas.ToxicityMetric] + Sequence[schemas.AnswerCorrectnessMetric | schemas.AnswerRelevanceMetric | schemas.BiasMetric | schemas.BLEUMetric | schemas.ContextPrecisionMetric | schemas.ContextRecallMetric | schemas.ContextRelevanceMetric | schemas.FaithfulnessMetric | schemas.HallucinationMetric | schemas.ROUGEMetric | schemas.SummaryCoherenceMetric | schemas.ToxicityMetric] A list of computed metrics. """ + is_AnswerCorrectness_enabled = ( + MetricType.AnswerCorrectness in metrics_to_return + ) + is_AnswerRelevance_enabled = ( + MetricType.AnswerRelevance in metrics_to_return + ) + is_Bias_enabled = MetricType.Bias in metrics_to_return + is_BLEU_enabled = MetricType.BLEU in metrics_to_return + is_ContextPrecision_enabled = ( + MetricType.ContextPrecision in metrics_to_return + ) + is_ContextRecall_enabled = MetricType.ContextRecall in metrics_to_return + is_ContextRelevance_enabled = ( + MetricType.ContextRelevance in metrics_to_return + ) + is_Faithfulness_enabled = MetricType.Faithfulness in metrics_to_return + is_Hallucination_enabled = MetricType.Hallucination in metrics_to_return + is_ROUGE_enabled = MetricType.ROUGE in metrics_to_return + is_SummaryCoherence_enabled = ( + MetricType.SummaryCoherence in metrics_to_return + ) + is_Toxicity_enabled = MetricType.Toxicity in metrics_to_return + + client = None + if any([metric in metrics_to_return for metric in LLM_GUIDED_METRICS]): + if llm_api_params is None: + raise ValueError( + f"llm_api_params must be provided for the following metrics: {[metric for metric in metrics_to_return if metric in LLM_GUIDED_METRICS]}." + ) + client = _setup_llm_client(llm_api_params) + prediction_subquery = ( generate_select( models.Annotation.datum_id.label("datum_id"), @@ -303,16 +346,17 @@ def _compute_text_generation_metrics( .subquery() ) + # Text comparison metrics require both predictions and ground truths. output = [] if any( [metric in TEXT_COMPARISON_METRICS for metric in metrics_to_return] ): - # get reference text to compare against from groundtruths - # use array_agg since there can be multiple references for a given datum_uid + # Use array_agg since there can be multiple ground truths and multiple predictions for a given datum_uid. 
groundtruth_subquery = ( generate_select( models.Datum.id.label("datum_id"), models.Datum.uid.label("datum_uid"), + models.Datum.text.label("datum_text"), models.Dataset.name.label("dataset_name"), functions.array_agg(models.Annotation.text).label( "groundtruth_text" @@ -324,6 +368,7 @@ def _compute_text_generation_metrics( .group_by( models.Datum.id.label("datum_id"), models.Datum.uid.label("datum_uid"), + models.Datum.text.label("datum_text"), models.Dataset.name.label("dataset_name"), ) .subquery() @@ -333,9 +378,13 @@ def _compute_text_generation_metrics( select( groundtruth_subquery.c.datum_uid, groundtruth_subquery.c.dataset_name, + groundtruth_subquery.c.datum_text, functions.array_agg( prediction_subquery.c.prediction_text ).label("predictions"), + functions.array_agg( + prediction_subquery.c.prediction_context_list + ).label("list_of_prediction_context_lists"), functions.array_agg( groundtruth_subquery.c.groundtruth_text ).label("references"), @@ -349,14 +398,40 @@ def _compute_text_generation_metrics( .group_by( groundtruth_subquery.c.datum_uid, groundtruth_subquery.c.dataset_name, + groundtruth_subquery.c.datum_text, ) ) results = db.execute(joint_subquery).all() - is_BLEU_enabled = "BLEU" in metrics_to_return - is_ROUGE_enabled = "ROUGE" in metrics_to_return - for datum_uid, dataset_name, predictions, references in results: + for ( + datum_uid, + dataset_name, + datum_text, + predictions, + list_of_prediction_context_lists, + references, + ) in results: + if is_AnswerCorrectness_enabled: + assert client + for (prediction, groundtruth_list) in zip( + predictions, references + ): + output += [ + schemas.AnswerCorrectnessMetric( + value=client.answer_correctness( + query=datum_text, + prediction=prediction, + groundtruth_list=groundtruth_list, + ), + parameters={ + "dataset": dataset_name, + "datum_uid": datum_uid, + "prediction": prediction, + }, + ) + ] + if is_BLEU_enabled: bleu_params = metric_params.get("BLEU", {}) if not isinstance(bleu_params, dict): @@ -380,6 +455,46 @@ def _compute_text_generation_metrics( ) for metric in bleu_metrics ] + + if is_ContextPrecision_enabled: + assert client + for (prediction_context_list, groundtruth_list) in zip( + list_of_prediction_context_lists, references + ): + output += [ + schemas.ContextPrecisionMetric( + value=client.context_precision( + query=datum_text, + ordered_context_list=prediction_context_list, + groundtruth_list=groundtruth_list, + ), + parameters={ + "dataset": dataset_name, + "datum_uid": datum_uid, + "context_list": prediction_context_list, + }, + ) + ] + + if is_ContextRecall_enabled: + assert client + for (prediction_context_list, groundtruth_list) in zip( + list_of_prediction_context_lists, references + ): + output += [ + schemas.ContextRecallMetric( + value=client.context_recall( + context_list=prediction_context_list, + groundtruth_list=groundtruth_list, + ), + parameters={ + "dataset": dataset_name, + "datum_uid": datum_uid, + "context_list": prediction_context_list, + }, + ) + ] + if is_ROUGE_enabled: rouge_params = metric_params.get("ROUGE", {}) if not isinstance(rouge_params, dict): @@ -415,19 +530,16 @@ def _compute_text_generation_metrics( for metric in rouge_metrics ] - client = None if any( [ - metric_name in LLM_GUIDED_METRICS + ( + metric_name in LLM_GUIDED_METRICS + and metric_name not in TEXT_COMPARISON_METRICS + ) for metric_name in metrics_to_return ] ): - if llm_api_params is None: - raise ValueError( - f"llm_api_params must be provided for the following metrics: {[metric for metric in 
metrics_to_return if metric in LLM_GUIDED_METRICS]}." - ) - client = _setup_llm_client(llm_api_params) - + assert client datum_subquery = ( generate_select( models.Datum.id.label("datum_id"), @@ -459,19 +571,6 @@ def _compute_text_generation_metrics( ) results = db.execute(joint_subquery).all() - is_AnswerRelevance_enabled = ( - MetricType.AnswerRelevance in metrics_to_return - ) - is_Bias_enabled = MetricType.Bias in metrics_to_return - is_Coherence_enabled = MetricType.Coherence in metrics_to_return - is_ContextRelevance_enabled = ( - MetricType.ContextRelevance in metrics_to_return - ) - is_Faithfulness_enabled = MetricType.Faithfulness in metrics_to_return - is_Hallucination_enabled = ( - MetricType.Hallucination in metrics_to_return - ) - is_Toxicity_enabled = MetricType.Toxicity in metrics_to_return for ( datum_uid, @@ -507,19 +606,6 @@ def _compute_text_generation_metrics( ) ] - if is_Coherence_enabled: - score = client.coherence(text=prediction_text) - output += [ - schemas.CoherenceMetric( - value=score, - parameters={ - "dataset": dataset_name, - "datum_uid": datum_uid, - "prediction": prediction_text, - }, - ) - ] - if is_ContextRelevance_enabled: score = client.context_relevance( query=datum_text, context_list=prediction_context_list @@ -567,6 +653,22 @@ def _compute_text_generation_metrics( ) ] + if is_SummaryCoherence_enabled: + score = client.summary_coherence( + text=datum_text, + summary=prediction_text, + ) + output += [ + schemas.SummaryCoherenceMetric( + value=score, + parameters={ + "dataset": dataset_name, + "datum_uid": datum_uid, + "prediction": prediction_text, + }, + ) + ] + if is_Toxicity_enabled: score = client.toxicity(text=prediction_text) output += [ diff --git a/api/valor_api/enums.py b/api/valor_api/enums.py index 9c4daf4d1..69b2da5f0 100644 --- a/api/valor_api/enums.py +++ b/api/valor_api/enums.py @@ -135,14 +135,17 @@ class MetricType(str, Enum): mIOU = "mIOU" PrecisionRecallCurve = "PrecisionRecallCurve" DetailedPrecisionRecallCurve = "DetailedPrecisionRecallCurve" + AnswerCorrectness = "AnswerCorrectness" AnswerRelevance = "AnswerRelevance" Bias = "Bias" BLEU = "BLEU" - Coherence = "Coherence" + ContextPrecision = "ContextPrecision" + ContextRecall = "ContextRecall" ContextRelevance = "ContextRelevance" Faithfulness = "Faithfulness" Hallucination = "Hallucination" ROUGE = "ROUGE" + SummaryCoherence = "SummaryCoherence" Toxicity = "Toxicity" diff --git a/api/valor_api/schemas/__init__.py b/api/valor_api/schemas/__init__.py index 3dc81ba08..e1ca8a7ec 100644 --- a/api/valor_api/schemas/__init__.py +++ b/api/valor_api/schemas/__init__.py @@ -28,16 +28,18 @@ from .info import APIVersion from .metrics import ( AccuracyMetric, + AnswerCorrectnessMetric, AnswerRelevanceMetric, APMetric, APMetricAveragedOverIOUs, ARMetric, BiasMetric, BLEUMetric, - CoherenceMetric, ConfusionMatrix, ConfusionMatrixEntry, ConfusionMatrixResponse, + ContextPrecisionMetric, + ContextRecallMetric, ContextRelevanceMetric, DetailedPrecisionRecallCurve, F1Metric, @@ -50,6 +52,7 @@ RecallMetric, ROCAUCMetric, ROUGEMetric, + SummaryCoherenceMetric, ToxicityMetric, mAPMetric, mAPMetricAveragedOverIOUs, @@ -125,13 +128,16 @@ "Health", "Readiness", "DatasetSummary", + "AnswerCorrectnessMetric", "AnswerRelevanceMetric", "BiasMetric", "BLEUMetric", - "CoherenceMetric", + "ContextPrecisionMetric", + "ContextRecallMetric", "ContextRelevanceMetric", "FaithfulnessMetric", "HallucinationMetric", "ROUGEMetric", + "SummaryCoherenceMetric", "ToxicityMetric", ] diff --git 
a/api/valor_api/schemas/evaluation.py b/api/valor_api/schemas/evaluation.py index 02f369f1d..76a614904 100644 --- a/api/valor_api/schemas/evaluation.py +++ b/api/valor_api/schemas/evaluation.py @@ -141,15 +141,26 @@ def _validate_parameters(cls, values): "`iou_thresholds_to_return` must be a subset of `iou_thresholds_to_compute`" ) case TaskType.TEXT_GENERATION: - text_comparison_metrics = set(["ROUGE", "BLEU"]) + text_comparison_metrics = set( + [ + "AnswerCorrectness", + "BLEU", + "ContextPrecision", + "ContextRecall", + "ROUGE", + ] + ) llm_guided_metrics = set( [ + "AnswerCorrectness", "AnswerRelevance", "Bias", - "Coherence", + "ContextPrecision", + "ContextRecall", "ContextRelevance", "Faithfulness", "Hallucination", + "SummaryCoherence", "Toxicity", ] ) diff --git a/api/valor_api/schemas/metrics.py b/api/valor_api/schemas/metrics.py index 407111918..58cddb674 100644 --- a/api/valor_api/schemas/metrics.py +++ b/api/valor_api/schemas/metrics.py @@ -747,6 +747,42 @@ def db_mapping(self, evaluation_id: int) -> dict: } +class AnswerCorrectnessMetric(BaseModel): + """ + Describes an answer correctness metric. + + Attributes + ---------- + value : float + The answer correctness score between 0 and 1, with higher values indicating that the answer is more correct. A score of 1 indicates that all statements in the prediction are supported by the ground truth and all statements in the ground truth are present in the prediction. + parameters : dict + Any parameters associated with the metric, as well as any datum or prediction parameters that are relevant to the metric. + """ + + value: float + parameters: dict + + def db_mapping(self, evaluation_id: int) -> dict: + """ + Creates a mapping for use when uploading the metric to the database. + + Parameters + ---------- + evaluation_id : int + The evaluation id. + + Returns + ---------- + A mapping dictionary. + """ + return { + "value": self.value, + "parameters": self.parameters, + "type": "AnswerCorrectness", + "evaluation_id": evaluation_id, + } + + class AnswerRelevanceMetric(BaseModel): """ Describes an answer relevance metric. @@ -855,19 +891,55 @@ def db_mapping(self, evaluation_id: int) -> dict: } -class CoherenceMetric(BaseModel): +class ContextPrecisionMetric(BaseModel): """ - Describes a coherence metric. + Describes a context precision metric. Attributes ---------- - value : int - The coherence score for a datum. This is an integer with 1 being the lowest coherence and 5 the highest coherence. + value : float + The context precision score for a datum. This is a float between 0 and 1, with 0 indicating that none of the contexts are useful to arrive at the ground truth answer to the query and 1 indicating that all contexts are useful to arrive at the ground truth answer to the query. The score is more heavily influenced by earlier contexts in the list of contexts than later contexts. parameters : dict Any parameters associated with the metric, as well as any datum or prediction parameters that are relevant to the metric. """ - value: int + value: float + parameters: dict + + def db_mapping(self, evaluation_id: int) -> dict: + """ + Creates a mapping for use when uploading the metric to the database. + + Parameters + ---------- + evaluation_id : int + The evaluation id. + + Returns + ---------- + A mapping dictionary. 
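As a small usage illustration of the mapping described above (assuming the schemas module is importable as in the tests; the parameter values are made up):

```python
# Illustrative only: build a ContextPrecisionMetric and inspect its database mapping.
from valor_api import schemas

metric = schemas.ContextPrecisionMetric(
    value=0.75,
    parameters={
        "dataset": "test_dataset",
        "datum_uid": "uid1",
        "context_list": ["context1", "context2"],
    },
)

mapping = metric.db_mapping(evaluation_id=1)
assert mapping["type"] == "ContextPrecision"
assert set(mapping) == {"value", "parameters", "type", "evaluation_id"}
```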
+ """ + return { + "value": self.value, + "parameters": self.parameters, + "type": "ContextPrecision", + "evaluation_id": evaluation_id, + } + + +class ContextRecallMetric(BaseModel): + """ + Describes a context recall metric. + + Attributes + ---------- + value : float + The context recall score for a datum. This is a float between 0 and 1, with 1 indicating that all ground truth statements are attributable to the context list. + parameters : dict + Any parameters associated with the metric, as well as any datum or prediction parameters that are relevant to the metric. + """ + + value: float parameters: dict def db_mapping(self, evaluation_id: int) -> dict: @@ -886,7 +958,7 @@ def db_mapping(self, evaluation_id: int) -> dict: return { "value": self.value, "parameters": self.parameters, - "type": "Coherence", + "type": "ContextRecall", "evaluation_id": evaluation_id, } @@ -1035,6 +1107,42 @@ def db_mapping(self, evaluation_id: int) -> dict: } +class SummaryCoherenceMetric(BaseModel): + """ + Describes a summary coherence metric. + + Attributes + ---------- + value : int + The summary coherence score for a datum. This is an integer with 1 being the lowest summary coherence and 5 the highest summary coherence. + parameters : dict + Any parameters associated with the metric, as well as any datum or prediction parameters that are relevant to the metric. + """ + + value: int + parameters: dict + + def db_mapping(self, evaluation_id: int) -> dict: + """ + Creates a mapping for use when uploading the metric to the database. + + Parameters + ---------- + evaluation_id : int + The evaluation id. + + Returns + ---------- + A mapping dictionary. + """ + return { + "value": self.value, + "parameters": self.parameters, + "type": "SummaryCoherence", + "evaluation_id": evaluation_id, + } + + class ToxicityMetric(BaseModel): """ Describes a toxicity metric. diff --git a/api/valor_api/schemas/types.py b/api/valor_api/schemas/types.py index 36151282e..556333670 100644 --- a/api/valor_api/schemas/types.py +++ b/api/valor_api/schemas/types.py @@ -94,9 +94,9 @@ def _match_annotation_to_implied_task_type( and annotation.context_list is None ): implied_type = ["embedding"] - # text generation tasks only support text and optionally context_list + # text generation tasks only support text and context_list, although some metrics only use text or only use context_list elif ( - annotation.text is not None + (annotation.text is not None or annotation.context_list is not None) and not annotation.labels and annotation.bounding_box is None and annotation.polygon is None diff --git a/client/valor/enums.py b/client/valor/enums.py index 50bc5b8d2..a75e296d7 100644 --- a/client/valor/enums.py +++ b/client/valor/enums.py @@ -51,14 +51,17 @@ class MetricType(str, Enum): mIOU = "mIOU" PrecisionRecallCurve = "PrecisionRecallCurve" DetailedPrecisionRecallCurve = "DetailedPrecisionRecallCurve" + AnswerCorrectness = "AnswerCorrectness" AnswerRelevance = "AnswerRelevance" Bias = "Bias" BLEU = "BLEU" - Coherence = "Coherence" + ContextPrecision = "ContextPrecision" + ContextRecall = "ContextRecall" ContextRelevance = "ContextRelevance" Faithfulness = "Faithfulness" Hallucination = "Hallucination" ROUGE = "ROUGE" + SummaryCoherence = "SummaryCoherence" Toxicity = "Toxicity" @classmethod @@ -111,14 +114,17 @@ def text_generation(cls) -> Set["MetricType"]: MetricTypes for text-generation tasks. 
""" return { + cls.AnswerCorrectness, cls.AnswerRelevance, cls.Bias, cls.BLEU, - cls.Coherence, + cls.ContextPrecision, + cls.ContextRecall, cls.ContextRelevance, cls.Faithfulness, cls.Hallucination, cls.ROUGE, + cls.SummaryCoherence, cls.Toxicity, } diff --git a/docs/metrics.md b/docs/metrics.md index 7a1250339..5d61e70ac 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -28,13 +28,13 @@ If we're missing an important metric for your particular use case, please [write | Detailed Precision-Recall Curves | Similar to `PrecisionRecallCurve`, except this metric a) classifies false positives as `hallucinations` or `misclassifications`, b) classifies false negatives as `misclassifications` or `missed_detections`, and c) gives example datums and bounding boxes for each observation, up to a maximum of `pr_curve_max_examples`. | See [detailed precision-recall curve methods](#detailedprecisionrecallcurve)| -**When calculating IOUs for object detection metrics, Valor handles the necessary conversion between different types of geometric annotations. For example, if your model prediction is a polygon and your groundtruth is a raster, then the raster will be converted to a polygon prior to calculating the IOU. +**When calculating IOUs for object detection metrics, Valor handles the necessary conversion between different types of geometric annotations. For example, if your model prediction is a polygon and your ground truth is a raster, then the raster will be converted to a polygon prior to calculating the IOU. ## Semantic Segmentation Metrics | Name | Description | Equation | | :- | :- | :- | -| Intersection Over Union (IOU) | A ratio between the groundtruth and predicted regions of an image, measured as a percentage, grouped by class. |$\dfrac{area( prediction \cap groundtruth )}{area( prediction \cup groundtruth )}$ | +| Intersection Over Union (IOU) | A ratio between the ground truth and predicted regions of an image, measured as a percentage, grouped by class. |$\dfrac{area( prediction \cap groundtruth )}{area( prediction \cup groundtruth )}$ | | Mean IOU | The average of IOU across labels, grouped by label key. | $\dfrac{1}{\text{number of labels}} \sum\limits_{label \in labels} IOU_{c}$ | @@ -42,10 +42,18 @@ If we're missing an important metric for your particular use case, please [write | Name | Description | Equation | | :- | :- | :- | -| Answer Relevance | The number of statements in the answer that are relevant to the query, divided by the total number of statements in the answer | See [appendix](#answer-relevance) for details. | -| Coherence | Rates the coherence of a textual summary relative to some source text using a score from 1 to 5, where 5 means "This summary is extremely coherent based on the information provided in the source text". | See [appendix](#coherence) for details. | -| ROUGE | A score between 0 and 1 indicating how often the words in the ground truth string appeared in the predicted string (i.e., measuring recall). | See [appendix](#rouge) for details. | +| Answer Correctness | An f1 score computed by comparing statements from a predicted answer to statements from a ground truth. | See [appendix](#answer-correctness-llm-guided) for details. | +| Answer Relevance | The proportion of statements in the answer that are relevant to the query. | $\dfrac{\textnormal{Number of Relevant Statements}}{\textnormal{Total Number of Statements}}$ | +| Bias | The proportion of opinions in the predicted text that are biased. 
| $\dfrac{\textnormal{Number of Biased Opinions}}{\textnormal{Total Number of Opinions}}$ | | BLEU | A score between 0 and 1 indicating how much the predicted string matches the ground truth string (i.e., measuring precision), with a penalty for brevity. | See [appendix](#bleu) for details. | +| Context Precision | An LLM-guided metric to evaluate a RAG retrieval mechanism. | See [appendix](#context-precision-llm-guided) for details. | +| Context Recall | An LLM-guided metric to evaluate a RAG retrieval mechanism. | See [appendix](#context-recall-llm-guided) for details. | +| Context Relevance | The proportion of retrieved contexts that are relevant to the query. | $\dfrac{\textnormal{Number of Relevant Contexts}}{\textnormal{Total Number of Contexts}}$ | +| Faithfulness | The proportion of claims in the predicted answer that are implied by the retrieved contexts. | $\dfrac{\textnormal{Number of Implied Claims}}{\textnormal{Total Number of Claims}}$ | +| Hallucination | The proportion of retrieved contexts that are contradicted by the predicted answer. | $\dfrac{\textnormal{Number of Contradicted Contexts}}{\textnormal{Total Number of Contexts}}$ | +| ROUGE | A score between 0 and 1 indicating how often the words in the ground truth text appeared in the predicted text (i.e., measuring recall). | See [appendix](#rouge) for details. | +| Summary Coherence | Rates the coherence of a textual summary relative to some source text using a score from 1 to 5, where 5 means "This summary is extremely coherent based on the information provided in the source text". | See [appendix](#summary-coherence-llm-guided) for details. | +| Toxicity | The proportion of opinions in the predicted text that are toxic. | $\dfrac{\textnormal{Number of Toxic Opinions}}{\textnormal{Total Number of Opinions}}$ | # Appendix: Metric Calculations @@ -213,7 +221,7 @@ It's important to note that these curves are computed slightly differently from ### Classification Tasks -Valor calculates its aggregate precision, recall, and F1 metrics by matching the highest confidence prediction with each groundtruth. One issue with this approach is that we may throw away useful information in cases where prediction labels all have similarly strong confidence scores. For example: if our top two predictions for a given ground truth are `{“label”: cat, “score”:.91}` and `{“label”: dog, “score”:.90}`, then our aggregated precision and recall metrics would penalize the `dog` label even though its confidence score was nearly equal to the `cat` label. +Valor calculates its aggregate precision, recall, and F1 metrics by matching the highest confidence prediction with each ground truth. One issue with this approach is that we may throw away useful information in cases where prediction labels all have similarly strong confidence scores. For example: if our top two predictions for a given ground truth are `{“label”: cat, “score”:.91}` and `{“label”: dog, “score”:.90}`, then our aggregated precision and recall metrics would penalize the `dog` label even though its confidence score was nearly equal to the `cat` label. We think the approach above makes sense when calculating aggregate precision and recall metrics, but, when calculating the `PrecisionRecallCurve` value for each label, we consider all ground truth-prediction matches in order to treat each label as its own, separate binary classification problem. 
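+
+To make the per-label treatment concrete, here is a minimal sketch (not Valor's internal implementation) of precision and recall for a single label value at one score threshold, with every datum treated as an independent binary decision. The `gts` and `preds` structures are illustrative only:
+
+```python
+def binary_pr_at_threshold(
+    groundtruths: list[str],
+    predictions: list[dict[str, float]],
+    label: str,
+    threshold: float,
+) -> tuple[float, float]:
+    """Precision/recall for one label value, treated as its own binary problem."""
+    tp = fp = fn = 0
+    for gt, scores in zip(groundtruths, predictions):
+        predicted_positive = scores.get(label, 0.0) >= threshold
+        actually_positive = gt == label
+        if predicted_positive and actually_positive:
+            tp += 1
+        elif predicted_positive:
+            fp += 1
+        elif actually_positive:
+            fn += 1
+    precision = tp / (tp + fp) if tp + fp else 0.0
+    recall = tp / (tp + fn) if tp + fn else 0.0
+    return precision, recall
+
+# Toy data: one classification datum per entry, with per-label confidence scores.
+gts = ["cat", "dog", "cat"]
+preds = [
+    {"cat": 0.91, "dog": 0.90},
+    {"cat": 0.40, "dog": 0.60},
+    {"cat": 0.30, "dog": 0.70},
+]
+print(binary_pr_at_threshold(gts, preds, label="dog", threshold=0.5))
+# (0.3333333333333333, 1.0): both "cat" datums cross the 0.5 threshold for "dog", so precision drops.
+```
+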
@@ -229,23 +237,23 @@ The `PrecisionRecallCurve` values differ from the precision-recall curves used t Valor also includes a more detailed version of `PrecisionRecallCurve` which can be useful for debugging your model's false positives and false negatives. When calculating `DetailedPrecisionCurve`, Valor will classify false positives as either `hallucinations` or `misclassifications` and your false negatives as either `missed_detections` or `misclassifications` using the following logic: #### Classification Tasks - - A **false positive** occurs when there is a qualified prediction (with `score >= score_threshold`) with the same `Label.key` as the groundtruth on the datum, but the `Label.value` is incorrect. - - **Example**: if there's a photo with one groundtruth label on it (e.g., `Label(key='animal', value='dog')`), and we predicted another label value (e.g., `Label(key='animal', value='cat')`) on that datum, we'd say it's a `misclassification` since the key was correct but the value was not. - - Similarly, a **false negative** occurs when there is a prediction with the same `Label.key` as the groundtruth on the datum, but the `Label.value` is incorrect. + - A **false positive** occurs when there is a qualified prediction (with `score >= score_threshold`) with the same `Label.key` as the ground truth on the datum, but the `Label.value` is incorrect. + - **Example**: if there's a photo with one ground truth label on it (e.g., `Label(key='animal', value='dog')`), and we predicted another label value (e.g., `Label(key='animal', value='cat')`) on that datum, we'd say it's a `misclassification` since the key was correct but the value was not. + - Similarly, a **false negative** occurs when there is a prediction with the same `Label.key` as the ground truth on the datum, but the `Label.value` is incorrect. - Stratifications of False Negatives: - `misclassification`: Occurs when a different label value passes the score threshold. - `no_predictions`: Occurs when no label passes the score threshold. #### Object Detection Tasks - A **false positive** is a `misclassification` if the following conditions are met: - 1. There is a qualified prediction with the same `Label.key` as the groundtruth on the datum, but the `Label.value` is incorrect - 2. The qualified prediction and groundtruth have an IOU >= `pr_curve_iou_threshold`. + 1. There is a qualified prediction with the same `Label.key` as the ground truth on the datum, but the `Label.value` is incorrect + 2. The qualified prediction and ground truth have an IOU >= `pr_curve_iou_threshold`. - A **false positive** that does not meet the `misclassification` criteria is considered to be a part of the `hallucinations` set. - A **false negative** is determined to be a `misclassification` if the following criteria are met: - 1. There is a qualified prediction with the same `Label.key` as the groundtruth on the datum, but the `Label.value` is incorrect. - 2. The qualified prediction and groundtruth have an IOU >= `pr_curve_iou_threshold`. + 1. There is a qualified prediction with the same `Label.key` as the ground truth on the datum, but the `Label.value` is incorrect. + 2. The qualified prediction and ground truth have an IOU >= `pr_curve_iou_threshold`. - For a **false negative** that does not meet this criteria, we consider it to have `no_predictions`. 
- - **Example**: if there's a photo with one groundtruth label on it (e.g., `Label(key='animal', value='dog')`), and we predicted another bounding box directly over that same object (e.g., `Label(key='animal', value='cat')`), we'd say it's a `misclassification`. + - **Example**: if there's a photo with one ground truth label on it (e.g., `Label(key='animal', value='dog')`), and we predicted another bounding box directly over that same object (e.g., `Label(key='animal', value='cat')`), we'd say it's a `misclassification`. The `DetailedPrecisionRecallOutput` also includes up to `n` examples of each type of error, where `n` is set using `pr_curve_max_examples`. An example output is as follows: @@ -254,8 +262,8 @@ The `DetailedPrecisionRecallOutput` also includes up to `n` examples of each typ # To retrieve more detailed examples for each `fn`, `fp`, and `tp`, look at the `DetailedPrecisionRecallCurve` metric detailed_evaluation = evaluate_detection( data=dataset, - pr_curve_max_examples=1 # The maximum number of examples to return for each obseration type (e.g., hallucinations, misclassifications, etc.) - metrics_to_return=[..., 'DetailedPrecisionRecallCurve'] # DetailedPrecisionRecallCurve isn't returned by default; the user must ask for it explicitely + pr_curve_max_examples=1 # The maximum number of examples to return for each observation type (e.g., hallucinations, misclassifications, etc.) + metrics_to_return=[..., 'DetailedPrecisionRecallCurve'] # DetailedPrecisionRecallCurve isn't returned by default; the user must ask for it explicitly ) print(detailed_evaluation) @@ -310,20 +318,27 @@ print(detailed_evaluation) ## General Text Generation Metrics -The general text generation metrics apply to a broad set of text generation tasks. These metrics don't compare to groundtruths and don't require context. The metrics are evaluated purely based on the predicted text. +The general text generation metrics apply to a broad set of text generation tasks. These metrics don't compare to ground truths and don't require context. The metrics are evaluated purely based on the predicted text. Some of the general text generation metrics are not necessarily useful in all tasks, but still can be used. For example, the bias and toxicity metrics evaluate opinions in the predicted text for bias/toxicity. If a task should have few/no opinions, then these metrics might not be useful. However bias and toxicity can still be evaluated on the predicted text, and if there are no opinions, then the bias/toxicity scores should be 0, indicating that there were no biased/toxic opinions. -### Bias +### Bias (LLM-Guided) -Bias is the proportion of biased opinions in a piece of text. +Uses +- Prediction - Annotation.text -Our implementation closely follows [DeepEval's implementation](https://github.com/confident-ai/deepeval/tree/main/deepeval/metrics/bias). We use the same two step prompting strategy and the same instructions. Notably, we follow the same [definitions of opinion and bias](https://docs.confident-ai.com/docs/metrics-bias#definition-of-bias). +Bias is the proportion of biased opinions in a piece of text. If there are no opinions in the text, then the bias score is 0. + +First, an LLM is prompted to extract opinions from the text. Then, the LLM is prompted to evaluate whether each of the opinions is biased. The bias score is the proportion of biased opinions in the text. 
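+
+As a rough illustration of this two-step flow (a sketch only; the `extract_opinions` and `opinion_is_biased` callables stand in for the two LLM prompts and are not part of Valor's API):
+
+```python
+from typing import Callable
+
+
+def bias_score(
+    text: str,
+    extract_opinions: Callable[[str], list[str]],
+    opinion_is_biased: Callable[[str], bool],
+) -> float:
+    """Proportion of biased opinions in `text`; 0.0 if the text contains no opinions."""
+    opinions = extract_opinions(text)  # first LLM prompt
+    if not opinions:
+        return 0.0
+    biased = sum(opinion_is_biased(opinion) for opinion in opinions)  # second LLM prompt, one verdict per opinion
+    return biased / len(opinions)
+
+
+# Toy usage with stand-in judges (real verdicts come from the LLM):
+print(bias_score(
+    "Text with two opinions.",
+    extract_opinions=lambda text: ["opinion A", "opinion B"],
+    opinion_is_biased=lambda opinion: opinion == "opinion A",
+))  # 0.5
+```
+
+The toxicity metric described below follows the same two-step pattern with a different judging prompt.
+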
+ +$$Bias = \frac{\textnormal{Number of Biased Opinions}}{\textnormal{Total Number of Opinions}}$$ + +Our implementation closely follows [DeepEval's implementation](https://github.com/confident-ai/deepeval/tree/main/deepeval/metrics/bias). We use the same two step prompting strategy and modified DeepEval's instructions. Notably, we follow the same [definitions of opinion and bias](https://docs.confident-ai.com/docs/metrics-bias#definition-of-bias). In DeepEval, an opinion is defined according to the following principles: -- opinions are personal beliefs or judgments, not verifiable facts -- a mistaken statement of fact (eg. "The Earth is Flat") is merely incorrect, not an opinion -- if a source is cited (eg. "Fox News thinks Donald Trump is a better President than Joe Biden"), it's a reported statement, not a subjective opinion +- Opinions are personal beliefs or judgments, not verifiable facts. +- A mistaken statement of fact (eg. "The Earth is Flat") is merely incorrect, not an opinion. +- If a source is cited (eg. "Fox News thinks the Republican presidential candidate is better than the Democratic presidential candidate."), it's a reported statement, not a subjective opinion of the author of the text. In DeepEval, bias is defined according to the following rubric: - Gender Bias: Discrimination based on a person's gender. @@ -339,22 +354,23 @@ In DeepEval, bias is defined according to the following rubric: - Biased: Crime rates are always higher in those big cities. - Not Biased: Studies show a correlation between population density and certain types of crime. -### Coherence +### Toxicity (LLM-Guided) -Coherence is a measure, on a scale of 1 to 5, of the collective quality of all sentences for a piece of text, with 5 indicating the highest coherence. The coherence of a piece of text is evaluated solely based on the text, without any reference to the query or any context. Because of this, the coherence metric can be applied to any text generation task. +Uses +- Prediction - Annotation.text -Valor's implementation of the coherence metric uses an instruction that was adapted from appendix A of DeepEval's paper G-EVAL: [NLG Evaluation using GPT-4 with Better Human Alignment](https://arxiv.org/pdf/2303.16634). While DeepEval's instruction and evaluation process was specific to summarization tasks, Valor generalized the instruction to apply to any text generation task. Most crucially, Valor does not use the datum text in its coherence evaluation. +Toxicity is the proportion of toxic opinions in a piece of text. If there are no opinions in the text, then the toxicity score is 0. -### Toxicity +First, an LLM is prompted to extract opinions from the text. Then, the LLM is prompted to evaluate whether each of the opinions is toxic. The toxicity score is the proportion of toxic opinions in the text. -Toxicity is the proportion of toxic opinions in a piece of text. +$$Toxicity = \frac{\textnormal{Number of Toxic Opinions}}{\textnormal{Total Number of Opinions}}$$ -Our implementation closely follows [DeepEval's implementation](https://github.com/confident-ai/deepeval/tree/main/deepeval/metrics/toxicity). We use the same two step prompting strategy and the same instructions. Notably, we follow the same [definitions of opinion and toxicity](https://docs.confident-ai.com/docs/metrics-toxicity#definition-of-toxicity). +Our implementation closely follows [DeepEval's implementation](https://github.com/confident-ai/deepeval/tree/main/deepeval/metrics/toxicity). 
We use the same two step prompting strategy and modified DeepEval's instructions. Notably, we follow the same [definitions of opinion and toxicity](https://docs.confident-ai.com/docs/metrics-toxicity#definition-of-toxicity). In DeepEval, an opinion is defined according to the following principles: -- opinions are personal beliefs or judgments, not verifiable facts -- a mistaken statement of fact (eg. "The Earth is Flat") is merely incorrect, not an opinion -- if a source is cited (eg. "Fox News thinks Donald Trump is a better President than Joe Biden"), it's a reported statement, not a subjective opinion +- Opinions are personal beliefs or judgments, not verifiable facts. +- A mistaken statement of fact (eg. "The Earth is Flat") is merely incorrect, not an opinion. +- If a source is cited (eg. "Fox News thinks the Republican presidential candidate is better than the Democratic presidential candidate."), it's a reported statement, not a subjective opinion of the author of the text. In DeepEval, whether an opinion is toxic is defined according to the following rubric: - Personal Attacks: Insults or hostile comments aimed at degrading the individual rather than addressing their ideas. @@ -375,52 +391,167 @@ In DeepEval, whether an opinion is toxic is defined according to the following r ## Q&A Metrics -Question and Answering (Q&A) is a subcategory of text generation tasks in which the datum is a query/question, and the prediction is an answer to that query. In this setting we can evaluate the predicted text based on properties such as relevance to the answer or the correctness of the answer. These metrics will not apply to all text generation tasks. For example, not all text generation tasks have a single correct answer. +Question and Answering (Q&A) is a subcategory of text generation tasks in which the datum is a query/question, and the prediction is an answer to that query. In this setting we can evaluate the predicted text based on properties such as relevance to the query or the correctness of the answer. These metrics will not apply to all text generation tasks. For example, not all text generation tasks have a single correct answer. + +### Answer Correctness (LLM-Guided) + +Uses +- GroundTruth - Annotation.text +- Prediction - Annotation.text + +Answer correctness is computed as a comparison between a ground truth text and a prediction text. + +First, an LLM is prompted to extract statements from both the ground truth and prediction texts. Then, the LLM is prompted to determine if each statement in the prediction is supported by the ground truth and if each statement in the ground truth is present in the prediction. If a prediction statement is supported by the ground truth, this is a true positive (tp). If a prediction statement is not supported by the ground truth, this is a false positive (fp). If a ground truth statement is not represented in the prediction, this is a false negative (fn). + +The answer correctness score is computed as an f1 score: + +$$AnswerCorrectness = \frac{tp}{tp + 0.5 * (fp + fn)}$$ + +If there are no true positives, the score is 0. Answer correctness will be at most 1, and is 1 only if all statements in the prediction are supported by the ground truth and all statements in the ground truth are present in the prediction. + +If there are multiple ground truth answers for a datum, then the answer correctness score is computed for each ground truth answer and the maximum score is taken. 
Thus the answer correctness score for a prediction is its highest answer correctness score across all ground truth answers. + +Our implementation was adapted from [RAGAS's implementation](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_answer_correctness.py). We follow a similar prompting strategy and computation, however we do not do a weighted sum with an answer similarity score using embeddings. RAGAS's answer correctness metric is a weighted sum of the f1 score described here with the answer similarity score. RAGAS computes answer similarity by embedding both the ground truth and prediction and taking their inner product. They use default weights of 0.75 for the f1 score and 0.25 for the answer similarity score. In Valor, we decided to implement answer correctness as just the f1 score, so that users are not required to supply an embedding model. + +### Answer Relevance (LLM-Guided) -### Answer Relevance +Uses +- Datum.text +- Prediction - Annotation.text Answer relevance is the proportion of statements in the answer that are relevant to the query. This metric is used to evaluate the overall relevance of the answer to the query. The answer relevance metric is particularly useful for evaluating question-answering tasks, but could also apply to some other text generation tasks. This metric is not recommended for more open ended tasks. -Our implementation closely follows [DeepEval's implementation](https://github.com/confident-ai/deepeval/tree/main/deepeval/metrics/answer_relevancy). We use the same two step prompting strategy and the same instructions. +First, an LLM is prompted to extract statements from the predicted text. Then, the LLM is prompted to determine if each statement in the prediction is relevant to the query. The answer relevance score is the proportion of relevant statements in the prediction. + +$$AnswerRelevance = \frac{\textnormal{Number of Relevant Statements}}{\textnormal{Total Number of Statements}}$$ + +Our implementation closely follows [DeepEval's implementation](https://github.com/confident-ai/deepeval/tree/main/deepeval/metrics/answer_relevancy). We use the same two step prompting strategy and modified DeepEval's instructions. ## RAG Metrics -Note that RAG is a form of Q&A, so any Q&A metric can also be used to evaluate RAG models. The metrics in this section however should not be used for all Q&A tasks. RAG specific metrics use retrieved contexts, so should not be used to evaluate models that don't use contexts. +Retrieval Augmented Generation (RAG) is a subcategory of Q&A where the model retrieves contexts from a database, then uses the retrieved contexts to aid in generating an answer. RAG models can be evaluated with Q&A metrics (AnswerCorrectness and AnswerRelevance) that evaluate the quality of the generated answer to the query, but RAG models can also be evaluated with RAG specific metrics. Some RAG metrics (Faithfulness and Hallucination) evaluate the quality of the generated answer relative to the retrieved contexts. Other RAG metrics (ContextPrecision, ContextRecall and ContextRelevance) evaluate the retrieval mechanism by evaluating the quality of the retrieved contexts relative to the query and/or ground truth answers. + +### Context Precision (LLM-Guided) + +Uses +- Datum.text +- GroundTruth - Annotation.text +- Prediction - Annotation.context + +Context precision is an LLM-guided metric that uses the query, an ordered list of retrieved contexts and a ground truth to evaluate a RAG retrieval mechanism. 
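+
+As a preview of the computation described in the rest of this section, the final score can be sketched in a few lines once the per-context verdicts are known (a toy illustration, not Valor's implementation; `verdicts` is assumed to already hold the LLM's "yes"/"no" judgments, ordered as the contexts were retrieved):
+
+```python
+def context_precision(verdicts: list[bool]) -> float:
+    """Average of precision@k over positions k with a "yes" verdict; 0.0 if there are none."""
+    if not any(verdicts):
+        return 0.0
+    total = 0.0
+    yes_so_far = 0
+    for k, verdict in enumerate(verdicts, start=1):
+        yes_so_far += verdict
+        if verdict:
+            total += yes_so_far / k  # precision@k, counted only when the k-th verdict is "yes"
+    return total / sum(verdicts)
+
+
+print(context_precision([True, False, False, True]))   # (1/1 + 2/4) / 2 = 0.75
+print(context_precision([False, True, False, True]))   # (1/2 + 2/4) / 2 = 0.5
+```
+
+The two calls reproduce the worked example given later in this section.
+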
+ +First, an LLM is prompted to determine if each context in the context list is useful for producing the ground truth answer to the query. A verdict is produced by the LLM for each context, either "yes" this context is useful for producing the ground truth answer or "no" this context is not useful for producing the ground truth answer. + +Second, the list of verdicts is used to compute the context precision score. The context precision score is computed as a weighted sum of the precision at $k$ for each $k$ from 1 to the length of the context list. + +The precision at $k$ is the proportion of "yes" verdicts amongst the first $k$ contexts. Because the precision at $k$ considers the first $k$ contexts, the order of the context list matters. If the RAG retrieval mechanism returns contexts with a measure of the relevance of each context to the query, then the contexts should be ordered from most relevant to least relevant. The formula for precision at $k$ is: + +$$Precision@k = \frac{1}{k}\sum_{i=1}^kv_i$$ + +where $v_i$ is 1 if the $i$ th verdict is "yes" and 0 if the $i$ th verdict is "no". + +The context precision score is computed by adding up all the precision at $k$ for which the $k$ verdict is "yes", then dividing by the total number of contexts for which the verdict is "yes". You could think of this as averaging over the precision at $k$ for which the $k$th verdict is "yes". As an edge case, if all of the verdicts are "no", then the score is 0. If the total number of contexts is $K$, then the formula for context precision is: + +$$Context Precision = \frac{\sum_{k=1}^K(Precision@k \times v_k)}{\sum_{k=1}^Kv_k}$$ + +Note that context precision evaluates not just which contexts are retrieved, but the order of those contexts. The earlier a piece of context appears in the context list, the more important it is in the computation of this score. For example, the first context in the context list will be included in every precision at k computation, so will have a large influence on the final score, whereas the last context will only be used for the last precision at k computation, so will have a small influence on the final score. -### Context Relevance +As an example, suppose there are 4 contexts and the verdicts are ["yes", "no", "no", "yes"]. The precision at 1 is 1 and the precision at 4 is 0.5. The context precision score is then (1 + 0.5) / 2 = 0.75. If instead the verdicts were ["no", "yes", "no", "yes"], then the precision at 2 is 0.5 and the precision at 4 is 0.5, so the context precision score is (0.5 + 0.5) / 2 = 0.5. This example demonstrates how important the first few contexts are in determining the context precision score. Just swapping the first two contexts had a large impact on the score. -Context relevance is the proportion of pieces of retrieved contexts that are relevant to the query. A piece of context is considered relevant to the query if any part of the context is relevant to answering the query. For example, a piece of context might be a paragraph of text, so if the answer or part of the answer to a query is contained somewhere in that paragraph, then that piece of context is considered relevant. +If multiple ground truth answers are provided for a datum, then the verdict for each context is "yes" if the verdict for that context is "yes" for any ground truth. This results in an aggregate verdict for each context (aggregating over the ground truths). This list of aggregate verdicts is used for the precision at k computations. 
Note that this is different than computing the context precision score for each ground truth and taking the maximum score (that approach makes more sense for answer correctness and context recall). -Context relevance is useful for evaluating the retrieval mechanism of a RAG model. This metric does not considered the generated answer or any groundtruth answers to the query, only the retrieved contexts. +Our implementation uses the same computation as both [RAGAS](https://docs.ragas.io/en/latest/concepts/metrics/context_precision.html) and [DeepEval](https://docs.confident-ai.com/docs/metrics-contextual-precision). Our instruction is loosely adapted from [DeepEval's instruction](https://github.com/confident-ai/deepeval/blob/main/deepeval/metrics/contextual_precision/template.py). -Given the query and the list of contexts, an LLM is prompted to determine if each piece of context is relevant to the query. Then the score is computed as the number of relevant contexts divided by the total number of contexts. +### Context Recall (LLM-Guided) -Our implementation closely follows [DeepEval's implementation](https://github.com/confident-ai/deepeval/tree/main/deepeval/metrics/context_relevancy). The calculation is the same, however we modified the instruction for the LLM. The instruction in DeepEval contained typos and was organized in a confusing way, so we fixed the typos and reorganized the example to make the task clearer. +Uses +- GroundTruth - Annotation.text +- Prediction - Annotation.context -### Faithfulness +Context recall is an LLM-guided metric that uses a list of retrieved contexts and a ground truth answer to a query to evaluate a RAG retrieval mechanism. Context recall is the proportion of ground truth statements that are attributable to the context list. + +First, an LLM is prompted to extract a list of statements made in the ground truth answer. Second, the LLM is prompted with the context list and the list of ground truth statements to determine if each ground truth statement could be attributed to the context list. The number of ground truth statements that could be attributed to the context list is divided by the total number of ground truth statements to get the context recall score. + +$$Context Recall = \frac{\textnormal{Number of Ground Truth Statements Attributable to Context List}}{\textnormal{Total Number of Ground Truth Statements}}$$ + +If multiple ground truth answers are provided for a datum, then the context recall score is computed for each ground truth answer and the maximum score is taken. Thus the context recall for a prediction is its highest context recall score across all ground truth answers. + +Our implementation loosely follows [RAGAS](https://docs.ragas.io/en/latest/concepts/metrics/context_recall.html). The example in Valor's instruction was adapted from the example in [RAGAS's instruction](https://github.com/explodinggradients/ragas/blob/main/src/ragas/metrics/_context_recall.py). + +### Context Relevance (LLM-Guided) + +Uses +- Datum.text +- Prediction - Annotation.context + +Context relevance is an LLM-guided metric that uses a query and a list of retrieved contexts to evaluate a RAG retrieval mechanism. Context relevance is the proportion of pieces of retrieved contexts that are relevant to the query. A piece of context is considered relevant to the query if any part of the context is relevant to answering the query. 
For example, a piece of context might be a paragraph of text, so if the answer or part of the answer to a query is contained somewhere in that paragraph, then that piece of context is considered relevant. + +First, an LLM is prompted to determine if each piece of context is relevant to the query. Then the score is computed as the number of relevant contexts divided by the total number of contexts. + +$$Context Relevance = \frac{\textnormal{Number of Relevant Contexts}}{\textnormal{Total Number of Contexts}}$$ + +Our implementation follows [DeepEval's implementation](https://github.com/confident-ai/deepeval/tree/main/deepeval/metrics/context_relevancy). The LLM instruction was adapted from DeepEval's instruction. + +### Faithfulness (LLM-Guided) + +Uses +- Prediction - Annotation.text +- Prediction - Annotation.context Faithfulness is the proportion of claims from the predicted text that are implied by the retrieved contexts. -First, an LLM is prompted to extract a list of claims from the predicted text. Then, the LLM is prompted again with the list of claims and the list of contexts and is asked if each claim is implied / can be verified from the contexts. If the claim contradicts any context or if the claim is unrelated to the contexts, the LLM is instructed to indicate that the claim is not implied by the contexts. The number of implied claims is divided by the total number of claims to get the faithfulness score. +First, an LLM is prompted to extract a list of claims from the predicted text. Then, the LLM is prompted again with the list of claims and the context list and is asked if each claim is implied / can be verified from the contexts. If the claim contradicts any context or if the claim is unrelated to the contexts, the LLM is instructed to indicate that the claim is not implied by the contexts. The number of implied claims is divided by the total number of claims to get the faithfulness score. + +$$Faithfulness = \frac{\textnormal{Number of Implied Claims}}{\textnormal{Total Number of Claims}}$$ Our implementation loosely follows and combines the strategies of [DeepEval](https://docs.confident-ai.com/docs/metrics-faithfulness) and [RAGAS](https://docs.ragas.io/en/latest/concepts/metrics/faithfulness.html), however it is notable that DeepEval and RAGAS's definitions of faithfulness are not equivalent. The difference is that, if a claim is unrelated to the contexts (is not implied by any context but also does not contradict any context), then DeepEval counts this claim positively towards the faithfulness score, however RAGAS counts this claim against the faithfulness score. Valor follows the same definition as RAGAS, as we believe that a claim that is unrelated to the contexts should not be counted positively towards the faithfulness score. If a predicted text makes many claims that are unrelated and unverifiable from the contexts, then how can we consider that text faithful to the contexts? -We follow [DeepEval's prompting strategy](https://github.com/confident-ai/deepeval/blob/main/deepeval/metrics/faithfulness/template.py) as this strategy is closer to the other prompting strategies in Valor, however we heavily modify the instructions. Most notably, we reword the instructions and examples to follow RAGAS's definition of faithfulness. +We follow [DeepEval's prompting strategy](https://github.com/confident-ai/deepeval/blob/main/deepeval/metrics/faithfulness) as this strategy is closer to the other prompting strategies in Valor, however we heavily modify the instructions. 
Most notably, we reword the instructions and examples to follow RAGAS's definition of faithfulness. -### Hallucination +### Hallucination (LLM-Guided) + +Uses +- Prediction - Annotation.text +- Prediction - Annotation.context Hallucination is the proportion of contexts that are contradicted by the predicted text. If the predicted text does not contradict any of the retrieved contexts, then it should receive a hallucination score of 0. The hallucination score is computed as the number of contexts contradicted by the predicted text divided by the total number of contexts. -Given the list of context and the predicted text, an LLM is prompted to determine if the text agrees or contradicts with each piece of context. The LLM is instructed to only indicate contradiction if the text directly contradicts any context, and otherwise indicates agreement. +Given the context list and the predicted text, an LLM is prompted to determine if the text agrees or contradicts with each piece of context. The LLM is instructed to only indicate contradiction if the text directly contradicts any context, and otherwise indicates agreement. + +$$Hallucination = \frac{\textnormal{Number of Contradicted Contexts}}{\textnormal{Total Number of Contexts}}$$ + +Note the differences between faithfulness and hallucination. First, for hallucination a good score is low, whereas for faithfulness a good score is high. Second, hallucination is the proportion of contradicted contexts, whereas faithfulness is the proportion of implied claims. + +Our implementation follows [DeepEval's implementation](https://github.com/confident-ai/deepeval/tree/main/deepeval/metrics/hallucination). + +## Summarization Metrics + +Summarization is the task of generating a shorter version of a piece of text that retains the most important information. Summarization metrics evaluate the quality of a summary by comparing it to the original text. + +Note that Datum.text is used differently for summarization than for Q&A and RAG tasks. For summarization, the Datum.text should be the text that was summarized and the prediction text should be the generated summary. This is different than Q&A and RAG where the Datum.text is the query and the prediction text is the generated answer. -Our implementation closely follows [DeepEval's implementation](https://github.com/confident-ai/deepeval/tree/main/deepeval/metrics/hallucination). The calculation is the same and the instruction is almost the same except a few minor tweaks. +### Summary Coherence (LLM-Guided) -## Text Comparison Metrics +Uses +- Datum.text +- Prediction - Annotation.text -This section contains non-llm guided metrics for comparing a predicted text to one or more groundtruth texts. +Summary coherence is an LLM-guided metric that measures the collective quality of a summary on an integer scale of 1 to 5, where 5 indicates the highest summary coherence. The coherence of a summary is evaluated based on the summary and the text being summarized. + +An LLM is prompted to evaluate the collective quality of a summary given the text being summarized. The LLM is instructed to give a high coherence score if the summary hits the key points in the text and if the summary is logically coherent. There is no formula for summary coherence, as the LLM is instructed to directly output the score. 
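+
+Since the score is produced directly by the LLM, a sketch of the surrounding logic reduces to prompting, parsing, and validating the reply. The `llm_call` helper and the instruction text below are placeholders, not Valor's actual prompt:
+
+```python
+from typing import Callable
+
+
+def summary_coherence(
+    llm_call: Callable[[str], str],  # placeholder: sends an instruction, returns the raw reply
+    source_text: str,
+    summary: str,
+) -> int:
+    """Ask the LLM for a 1-5 coherence score and validate the reply."""
+    reply = llm_call(
+        "Rate the coherence of the following summary of the source text on a scale of 1 to 5. "
+        "Respond with a single integer.\n"
+        f"Source text: {source_text}\n"
+        f"Summary: {summary}"
+    )
+    score = int(reply.strip())
+    if score not in {1, 2, 3, 4, 5}:
+        raise ValueError(f"Expected an integer between 1 and 5, got {reply!r}")
+    return score
+```
+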
+ +Valor's implementation of the summary coherence metric uses an instruction that was adapted from appendix A of DeepEval's paper G-EVAL: [NLG Evaluation using GPT-4 with Better Human Alignment](https://arxiv.org/pdf/2303.16634). The instruction in appendix A of DeepEval's paper is specific to news articles, but Valor generalized this instruction to apply to any text summarization task. + +## Non-LLM-Guided Text Comparison Metrics + +This section contains non-LLM-guided metrics for comparing a predicted text to one or more ground truth texts. These metrics can be run without specifying any LLM api parameters. ### ROUGE +Uses +- GroundTruth - Annotation.text +- Prediction - Annotation.text + ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics used for evaluating automatic summarization and machine translation software in natural language processing. The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation. ROUGE metrics range between 0 and 1, with higher scores indicating higher similarity between the automatically produced summary and the reference. In Valor, the ROUGE output value is a dictionary containing the following elements: @@ -434,11 +565,15 @@ In Valor, the ROUGE output value is a dictionary containing the following elemen } ``` -Behind the scenes, we use [Hugging Face's `evaluate` package](https://huggingface.co/spaces/evaluate-metric/rouge) to calculate these scores. Users can pass `rouge_types` and `use_stemmer` to EvaluationParameters in order to gain access to additional functionality from this package. +Behind the scenes, we use [Hugging Face's `evaluate` package](https://huggingface.co/spaces/evaluate-metric/rouge) to calculate these scores. Users can pass `rouge_types` and `rouge_use_stemmer` to EvaluationParameters in order to gain access to additional functionality from this package. ### BLEU -BLEU (bilingual evaluation understudy) is an algorithm for evaluating automatic summarization and machine translation software in natural language processing. BLEU's output is always a number between 0 and 1, where a score near 1 indicates that the hypothesis text is very similar to one or more of the reference texts. +Uses +- GroundTruth - Annotation.text +- Prediction - Annotation.text + +BLEU (BiLingual Evaluation Understudy) is an algorithm for evaluating automatic summarization and machine translation software in natural language processing. BLEU's output is always a number between 0 and 1, where a score near 1 indicates that the hypothesis text is very similar to one or more of the reference texts. Behind the scenes, we use [nltk.translate.bleu_score](https://www.nltk.org/_modules/nltk/translate/bleu_score.html) to calculate these scores. The default BLEU metric calculates a score for up to 4-grams using uniform weights (i.e., `weights=[.25, .25, .25, .25]`; also called BLEU-4). Users can pass their own `bleu_weights` to EvaluationParameters in order to change this default behavior and calculate other BLEU scores. 
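+
+For reference, the underlying library calls look roughly like the following sketch (it assumes the `nltk`, `evaluate`, and `rouge_score` packages are installed; in Valor these calls are driven through `evaluate_text_generation`, with `bleu_weights`, `rouge_types`, and `rouge_use_stemmer` passed via EvaluationParameters):
+
+```python
+import evaluate
+from nltk.translate.bleu_score import sentence_bleu
+
+prediction = "the cat sat quietly on the warm mat near the door"
+ground_truth = "the cat sat quietly on the mat near the front door"
+
+# ROUGE via Hugging Face's `evaluate` package; `use_stemmer` corresponds to
+# Valor's `rouge_use_stemmer` parameter.
+rouge = evaluate.load("rouge")
+rouge_scores = rouge.compute(
+    predictions=[prediction],
+    references=[ground_truth],
+    rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"],
+    use_stemmer=False,
+)
+
+# BLEU-4 via nltk, using the default uniform 4-gram weights.
+bleu_score = sentence_bleu(
+    [ground_truth.split()],  # one or more tokenized reference texts
+    prediction.split(),      # the tokenized hypothesis text
+    weights=[0.25, 0.25, 0.25, 0.25],
+)
+
+print(rouge_scores, bleu_score)
+```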
\ No newline at end of file diff --git a/examples/text-generation/text_generation.ipynb b/examples/text-generation/text_generation.ipynb index e6d6bace0..65b8505e0 100644 --- a/examples/text-generation/text_generation.ipynb +++ b/examples/text-generation/text_generation.ipynb @@ -175,15 +175,15 @@ "source": [ "## Evaluation in Valor\n", "\n", - "In this example, the RAG pipeline produces answers to the given queries by retrieving context and then generating answers based on the context and query. Groundtruth answers are also known for these queries. Both the datums (which contain the queries) and the groundtruths are added to the dataset. Then, the predictions are added to the model, which includes the answer and the context used to generate that answer. \n", + "In this example, the RAG pipeline produces answers to the given queries by retrieving context and then generating answers based on the context and query. Ground truth answers are also known for these queries. Both the datums (which contain the queries) and the ground truths are added to the dataset. Then, the predictions are added to the model, which includes the answer and the context used to generate that answer. \n", "\n", - "The metrics requested include some text comparison metrics (BLEU and ROUGE), which do a text comparison between the generated answer and the groundtruth answer for the same datum. If the user only desires these metrics, then they do not need to include the context_list in the prediction and they do not need to supply the llm_api_parameters. \n", + "The metrics requested include some non-LLM-guided text comparison metrics (BLEU and ROUGE), which do a text comparison between the generated answer and the ground truth answer for the same datum. If the user only desires these metrics, then they do not need to include the context_list in the prediction and they do not need to supply the llm_api_parameters. \n", "\n", - "However, other metrics are requested that use llm guided evaluation (AnswerRelevance and Coherence). To get these metrics, the user needs to specify a client (openai or mistral), an api key and a model name, along with any other model kwargs. The api key can be stored in an environment variable or passed directly into model.evaluate_text_generation(). \n", + "However, the rest of the requested metrics use LLM-guided evaluation. To get these metrics, the user needs to specify a client (openai or mistral), an api key and a model name, along with any other model kwargs. The api key can be stored in an environment variable (OPENAI_API_KEY or MISTRAL_API_KEY) or passed directly into model.evaluate_text_generation(). \n", "\n", - "Each of these metrics will use API calls to the specified LLM service to get information relevant for computing the desired metrics. Some of these metrics, such as AnswerRelevance and Coherence, do not require any context, so can be used with a Q&A model that does not use context. Currently, none of these metrics use the context in their API calls or computations.\n", + "Each of these metrics will use API calls to the specified LLM service to get information relevant for computing the desired metrics. Some of these metrics, such as AnswerRelevance, do not require any context, so can be used with a Q&A model that does not use context. Other metrics, such as Hallucination, require context, so are only applicable to RAG models.\n", "\n", - "Note that AnswerRelevance is specific to the Q&A setting (RAG is a subcase of Q&A). 
AnswerRelevance measures how relevant the answer is to the question, by measuring the proportion of statements in the answer that are relevant to the question. This would not work as well in a less structured setting, such as summarization or content generation, where some statements in the generated text may not be directly relevant to the query. " + "For more information on each metric, see the Valor metric documentation in valor/docs/metrics.md." ] }, { @@ -212,7 +212,7 @@ "outputs": [], "source": [ "# For the purposes of this example, let's get metrics for just the first 5 datums.\n", - "MAX_DATUMS = 10\n", + "MAX_DATUMS = 5\n", "\n", "# Create, build and finalize the dataset and model.\n", "dataset = Dataset.create(\n", @@ -301,10 +301,13 @@ "eval_job = model.evaluate_text_generation(\n", " dataset,\n", " metrics_to_return=[\n", + " \"AnswerCorrectness\",\n", " \"AnswerRelevance\", \n", " \"Bias\",\n", " \"BLEU\", \n", - " \"Coherence\", \n", + " \"ContextPrecision\",\n", + " \"ContextRecall\",\n", + " \"ContextRelevance\",\n", " \"Faithfulness\",\n", " \"Hallucination\",\n", " \"ROUGE\",\n", @@ -333,6 +336,15 @@ "# Here are some example metrics. These are all for query0 and were evaluated by GPT-4o.\n", "example_expected_metrics = [\n", " {\n", + " 'type': 'AnswerCorrectness',\n", + " 'value': 0.5,\n", + " 'parameters': {\n", + " 'dataset': 'rag_dataset', \n", + " 'datum_uid': 'query0', \n", + " 'prediction': \"Cleveland's opponents in 1884 criticized his alleged involvement in a scandal regarding an illegitimate child, which was used to counter his innocent image during the presidential campaign.\",\n", + " },\n", + " },\n", + " {\n", " 'type': 'AnswerRelevance',\n", " 'value': 1.0,\n", " 'parameters': {\n", @@ -361,38 +373,65 @@ " },\n", " },\n", " {\n", - " 'type': 'Coherence',\n", - " 'value': 4.0,\n", + " 'type': 'ContextPrecision', \n", + " 'value': 0.0,\n", " 'parameters': {\n", - " 'dataset': 'rag_dataset',\n", - " 'datum_uid': 'query0',\n", - " 'prediction': \"Cleveland's opponents in 1884 criticized his alleged involvement in a scandal regarding an illegitimate child, which was used to counter his innocent image during the presidential campaign.\"\n", + " 'dataset': 'rag_dataset', \n", + " 'datum_uid': 'query0', \n", + " 'context_list': [\n", + " \"Cleveland was defeated in the 1888 presidential election, in part due to fraud (See Blocks of Five). He actually led in the popular vote over Benjamin Harrison (48.6% to 47.8%), but Harrison won the Electoral College by a 233-168 margin, largely by squeaking out a barely-over-1% win in Cleveland's home state of New York; in fact, had Cleveland won his home state, he would have won the electoral vote by a count of 204-197 (201 votes then needed for victory). Note, though, that Cleveland earned 24 of his electoral votes in states that he won by less than 1% (Connecticut, Virginia, and West Virginia).\", \n", + " \"Some of Cleveland's actions were controversial with political factions. Such criticisms include but are not limited to: his intervention in the Pullman Strike of 1894 in order to keep the railroads moving (a move which angered labor unions), his support of the gold standard, and opposition to free silver which alienated the agrarian wing of the Democrats. Furthermore, critics complained that he had little imagination and seemed overwhelmed by the nation's economic disasters depressions and strikes in his second term. 
He lost control of his party to the agrarians and silverites in 1896.\"\n", + " ]\n", + " },\n", + " },\n", + " {\n", + " 'type': 'ContextRecall',\n", + " 'value': 0.0, \n", + " 'parameters': {\n", + " 'dataset': 'rag_dataset', \n", + " 'datum_uid': 'query0', \n", + " 'context_list': [\n", + " \"Cleveland was defeated in the 1888 presidential election, in part due to fraud (See Blocks of Five). He actually led in the popular vote over Benjamin Harrison (48.6% to 47.8%), but Harrison won the Electoral College by a 233-168 margin, largely by squeaking out a barely-over-1% win in Cleveland's home state of New York; in fact, had Cleveland won his home state, he would have won the electoral vote by a count of 204-197 (201 votes then needed for victory). Note, though, that Cleveland earned 24 of his electoral votes in states that he won by less than 1% (Connecticut, Virginia, and West Virginia).\", \n", + " \"Some of Cleveland's actions were controversial with political factions. Such criticisms include but are not limited to: his intervention in the Pullman Strike of 1894 in order to keep the railroads moving (a move which angered labor unions), his support of the gold standard, and opposition to free silver which alienated the agrarian wing of the Democrats. Furthermore, critics complained that he had little imagination and seemed overwhelmed by the nation's economic disasters depressions and strikes in his second term. He lost control of his party to the agrarians and silverites in 1896.\"\n", + " ]\n", + " },\n", + " },\n", + " {\n", + " 'type': 'ContextRelevance', \n", + " 'value': 0.0,\n", + " 'parameters': {\n", + " 'dataset': 'rag_dataset', \n", + " 'datum_uid': 'query0', \n", + " 'context_list': [\n", + " \"Cleveland was defeated in the 1888 presidential election, in part due to fraud (See Blocks of Five). He actually led in the popular vote over Benjamin Harrison (48.6% to 47.8%), but Harrison won the Electoral College by a 233-168 margin, largely by squeaking out a barely-over-1% win in Cleveland's home state of New York; in fact, had Cleveland won his home state, he would have won the electoral vote by a count of 204-197 (201 votes then needed for victory). Note, though, that Cleveland earned 24 of his electoral votes in states that he won by less than 1% (Connecticut, Virginia, and West Virginia).\", \n", + " \"Some of Cleveland's actions were controversial with political factions. Such criticisms include but are not limited to: his intervention in the Pullman Strike of 1894 in order to keep the railroads moving (a move which angered labor unions), his support of the gold standard, and opposition to free silver which alienated the agrarian wing of the Democrats. Furthermore, critics complained that he had little imagination and seemed overwhelmed by the nation's economic disasters depressions and strikes in his second term. He lost control of his party to the agrarians and silverites in 1896.\"\n", + " ]\n", " },\n", " },\n", " {\n", " 'type': 'Faithfulness',\n", " 'value': 0.0,\n", " 'parameters': {\n", + " 'dataset': 'rag_dataset',\n", + " 'datum_uid': 'query0',\n", + " 'prediction': \"Cleveland's opponents in 1884 criticized his alleged involvement in a scandal regarding an illegitimate child, which was used to counter his innocent image during the presidential campaign.\",\n", " 'context_list': [\n", " \"Cleveland was defeated in the 1888 presidential election, in part due to fraud (See Blocks of Five). 
He actually led in the popular vote over Benjamin Harrison (48.6% to 47.8%), but Harrison won the Electoral College by a 233-168 margin, largely by squeaking out a barely-over-1% win in Cleveland's home state of New York; in fact, had Cleveland won his home state, he would have won the electoral vote by a count of 204-197 (201 votes then needed for victory). Note, though, that Cleveland earned 24 of his electoral votes in states that he won by less than 1% (Connecticut, Virginia, and West Virginia).\",\n", " \"Some of Cleveland's actions were controversial with political factions. Such criticisms include but are not limited to: his intervention in the Pullman Strike of 1894 in order to keep the railroads moving (a move which angered labor unions), his support of the gold standard, and opposition to free silver which alienated the agrarian wing of the Democrats. Furthermore, critics complained that he had little imagination and seemed overwhelmed by the nation's economic disasters depressions and strikes in his second term. He lost control of his party to the agrarians and silverites in 1896.\"\n", " ],\n", - " 'dataset': 'rag_dataset',\n", - " 'datum_uid': 'query0',\n", - " 'prediction': \"Cleveland's opponents in 1884 criticized his alleged involvement in a scandal regarding an illegitimate child, which was used to counter his innocent image during the presidential campaign.\"\n", " },\n", " },\n", " {\n", " 'type': 'Hallucination',\n", " 'value': 1.0,\n", " 'parameters': {\n", + " 'dataset': 'rag_dataset',\n", + " 'datum_uid': 'query0',\n", + " 'prediction': \"Cleveland's opponents in 1884 criticized his alleged involvement in a scandal regarding an illegitimate child, which was used to counter his innocent image during the presidential campaign.\",\n", " 'context_list': [\n", " \"Cleveland was defeated in the 1888 presidential election, in part due to fraud (See Blocks of Five). He actually led in the popular vote over Benjamin Harrison (48.6% to 47.8%), but Harrison won the Electoral College by a 233-168 margin, largely by squeaking out a barely-over-1% win in Cleveland's home state of New York; in fact, had Cleveland won his home state, he would have won the electoral vote by a count of 204-197 (201 votes then needed for victory). Note, though, that Cleveland earned 24 of his electoral votes in states that he won by less than 1% (Connecticut, Virginia, and West Virginia).\",\n", " \"Some of Cleveland's actions were controversial with political factions. Such criticisms include but are not limited to: his intervention in the Pullman Strike of 1894 in order to keep the railroads moving (a move which angered labor unions), his support of the gold standard, and opposition to free silver which alienated the agrarian wing of the Democrats. Furthermore, critics complained that he had little imagination and seemed overwhelmed by the nation's economic disasters depressions and strikes in his second term. He lost control of his party to the agrarians and silverites in 1896.\"\n", " ],\n", - " 'dataset': 'rag_dataset',\n", - " 'datum_uid': 'query0',\n", - " 'prediction': \"Cleveland's opponents in 1884 criticized his alleged involvement in a scandal regarding an illegitimate child, which was used to counter his innocent image during the presidential campaign.\"\n", " },\n", " },\n", " {\n", @@ -509,7 +548,7 @@ "\n", "In this example, CNN articles are summarized by GPT3.5-turbo. Groundtruth summaries are also known for these articles. 
Both the datums (which contain the articles) and the groundtruths are added to the dataset. Then, the predictions are added to the model, which includes just the generated summary (there is not retrieved context for summarization).\n", "\n", - "The metrics requested are BLEU, ROUGE and Coherence. BLEU and ROUGE are used to measure the similarity between the generated summary and the groundtruth summary. Coherence is an llm-guided metric that measures the overall quality and cohesiveness of the generated summary." + "The metrics requested are Bias, BLEU, ROUGE, SummaryCoherence and Toxicity. BLEU and ROUGE are used to measure the similarity between the generated summary and the ground truth summary. Bias and Toxicity are LLM-guided metrics that evaluate the generated summary for biased or toxic opinions. SummaryCoherence is an LLM-guided metric that measures the overall quality and cohesiveness of the generated summary." ] }, { @@ -598,8 +637,8 @@ " metrics_to_return=[\n", " \"Bias\",\n", " \"BLEU\",\n", - " \"Coherence\",\n", " \"ROUGE\",\n", + " \"SummaryCoherence\",\n", " \"Toxicity\",\n", " ],\n", " llm_api_params = {\n", @@ -642,15 +681,6 @@ " },\n", " },\n", " {\n", - " 'type': 'Coherence',\n", - " 'value': 5.0,\n", - " 'parameters': {\n", - " 'dataset': 'summarization_dataset',\n", - " 'datum_uid': 'article4',\n", - " 'prediction': 'British taekwondo fighter Aaron Cook plans to compete for Moldova at the 2016 Olympics in Rio after being overlooked for the Great Britain squad in London 2012. Cook received funding from a Moldovan billionaire and has now obtained Moldovan citizenship. He has decided to no longer compete for Great Britain due to feeling overlooked and unsupported, and hopes to represent Moldova at international competitions, including the Olympics. The British Olympic Association could potentially block this move, as discussions are ongoing.'\n", - " },\n", - " },\n", - " {\n", " 'type': 'ROUGE',\n", " 'value': {\n", " 'rouge1': 0.4915254237288136,\n", @@ -667,6 +697,15 @@ " },\n", " },\n", " {\n", + " 'type': 'SummaryCoherence',\n", + " 'value': 5.0,\n", + " 'parameters': {\n", + " 'dataset': 'summarization_dataset',\n", + " 'datum_uid': 'article4',\n", + " 'prediction': 'British taekwondo fighter Aaron Cook plans to compete for Moldova at the 2016 Olympics in Rio after being overlooked for the Great Britain squad in London 2012. Cook received funding from a Moldovan billionaire and has now obtained Moldovan citizenship. He has decided to no longer compete for Great Britain due to feeling overlooked and unsupported, and hopes to represent Moldova at international competitions, including the Olympics. The British Olympic Association could potentially block this move, as discussions are ongoing.'\n", + " },\n", + " },\n", + " {\n", " 'type': 'Toxicity',\n", " 'value': 0.0,\n", " 'parameters': {\n", @@ -756,9 +795,9 @@ "source": [ "## Evaluation in Valor\n", "\n", - "In this example, text is generated in response to multiple open-ended queries. These queries are written so that there is no correct or canonically good response, so there are no groundtruth annotations for these queries. To build the dataset, we add the queries to the datums, then add groundtruths to the dataset that only contain the datums and no annotations. We add the generated text as predictions to the model. \n", + "In this example, text is generated in response to multiple open-ended queries. 
These queries are written so that there is no correct or canonically good response, so there are no ground truth annotations for these queries. To build the dataset, we add the queries to the datums, then add ground truths to the dataset that only contain the datums and no annotations. We add the generated text as predictions to the model. \n", "\n", - "The only metric we request is Coherence, which is an llm-guided metric that measures the overall quality and cohesiveness of the generated text. We don't use text comparison metrics as there are no groundtruth annotations to compare to. We don't use Q&A/RAG metrics as there is no context and no query-answer structure to the data." + "The only metrics we request are Bias and Toxicity, which are LLM-guided. We don't use text comparison metrics as there are no ground truth annotations to compare to. We don't use Q&A, RAG or summarization metrics as those only apply for question answering and summarization tasks." ] }, { @@ -798,7 +837,7 @@ " row = df.iloc[i]\n", " datum = datum_list[i]\n", "\n", - " # There are no groundtruth annotations for content generation.\n", + " # There are no ground truth annotations for content generation.\n", " dataset.add_groundtruth(\n", " GroundTruth(\n", " datum=datum,\n", @@ -837,7 +876,6 @@ " dataset,\n", " metrics_to_return=[\n", " \"Bias\",\n", - " \"Coherence\",\n", " \"Toxicity\",\n", " ],\n", " llm_api_params = {\n", @@ -883,33 +921,6 @@ " },\n", " },\n", " {\n", - " \"value\": 5.0,\n", - " \"type\": \"Coherence\",\n", - " \"parameters\": {\n", - " \"dataset\": \"content_generation_dataset\",\n", - " \"datum_uid\": \"query2\",\n", - " \"prediction\": \"\"\"Subject: Project Delay Due to Funding Cuts\n", - "\n", - "Dear [Coworker's Name],\n", - "\n", - "I hope this message finds you well. I am writing to update you on the status of our project and unfortunately, convey some disappointing news regarding a delay in its completion.\n", - "\n", - "Due to recent funding cuts within our department, our project team has been significantly affected. Several team members, including myself, have been relocated to work on other projects to address the shifting priorities resulting from the budget constraints.\n", - "\n", - "As a consequence of these unexpected changes, it is with regret that I must inform you that the original deadline for our project will need to be extended. I understand the inconvenience that this may cause, and I sincerely apologize for any inconvenience this delay may bring to you and your plans.\n", - "\n", - "Rest assured that despite this setback, I am fully committed to ensuring that we still deliver the project with utmost efficiency and quality. I am exploring all possible avenues to mitigate the delay and work towards completing our project in a timely manner.\n", - "\n", - "I appreciate your understanding and patience during this challenging time. Your ongoing support and collaboration are invaluable as we navigate through this situation together. 
If you have any concerns or questions, please do not hesitate to reach out to me.\n", - "\n", - "Thank you for your understanding, and I look forward to working with you to successfully finalize our project.\n", - "\n", - "Warm regards,\n", - "\n", - "[Your Name]\"\"\",\n", - " },\n", - " },\n", - " {\n", " 'type': 'Toxicity',\n", " 'value': 0.0,\n", " 'parameters': {\n", diff --git a/integration_tests/client/metrics/test_text_generation_with_mock_client.py b/integration_tests/client/metrics/test_text_generation_with_mock_client.py index 11a6f3ea2..4f5278ca9 100644 --- a/integration_tests/client/metrics/test_text_generation_with_mock_client.py +++ b/integration_tests/client/metrics/test_text_generation_with_mock_client.py @@ -140,6 +140,70 @@ def rag_pred_answers( ] +@pytest.fixture +def summarization_q0() -> Datum: + return Datum( + uid="uid0", + text="""News article 0""", + ) + + +@pytest.fixture +def summarization_q1() -> Datum: + return Datum( + uid="uid1", + text="""News article 1""", + ) + + +@pytest.fixture +def summarization_datums( + summarization_q0: Datum, + summarization_q1: Datum, +) -> list[Datum]: + return [summarization_q0, summarization_q1] + + +@pytest.fixture +def summarization_predictions() -> list[str]: + return [ + """Summary 0""", + """Summary 1""", + ] + + +@pytest.fixture +def summarization_gt_questions( + summarization_datums: list[Datum], +) -> list[GroundTruth]: + return [ + GroundTruth( + datum=summarization_datums[i], + annotations=[], + ) + for i in range(len(summarization_datums)) + ] + + +@pytest.fixture +def summarization_pred_answers( + summarization_datums: list[Datum], + summarization_predictions: list[str], +) -> list[GroundTruth]: + assert len(summarization_datums) == len(summarization_predictions) + return [ + Prediction( + datum=summarization_datums[i], + annotations=[ + Annotation( + text=summarization_predictions[i], + ) + ], + ) + for i in range(len(summarization_datums)) + ] + + def test_llm_evaluation_rag_with_mock_client( client: Client, rag_gt_questions: list[GroundTruth], @@ -161,10 +225,12 @@ def test_llm_evaluation_rag_with_mock_client( model.finalize_inferences(dataset) metrics_to_return = [ + MetricType.AnswerCorrectness, MetricType.AnswerRelevance, MetricType.Bias, MetricType.BLEU, - MetricType.Coherence, + MetricType.ContextPrecision, + MetricType.ContextRecall, MetricType.ContextRelevance, MetricType.Faithfulness, MetricType.Hallucination, @@ -209,10 +275,12 @@ def test_llm_evaluation_rag_with_mock_client( expected_metrics = { "uid0": { + "AnswerCorrectness": 0.5, "AnswerRelevance": 0.5, "Bias": 0.5, "BLEU": 0.3502270395690205, - "Coherence": 4, + "ContextPrecision": 0.75, + "ContextRecall": 1.0, "ContextRelevance": 0.75, "Faithfulness": 0.3333333333333333, "Hallucination": 0.25, @@ -225,10 +293,12 @@ def test_llm_evaluation_rag_with_mock_client( "Toxicity": 0.0, }, "uid1": { + "AnswerCorrectness": 0.5, "AnswerRelevance": 0.5, "Bias": 0.5, "BLEU": 1.0, - "Coherence": 4, + "ContextPrecision": 0.75, + "ContextRecall": 1.0, "ContextRelevance": 0.75, "Faithfulness": 0.3333333333333333, "Hallucination": 0.25, @@ -241,10 +311,12 @@ def test_llm_evaluation_rag_with_mock_client( "Toxicity": 0.0, }, "uid2": { + "AnswerCorrectness": 0.5, "AnswerRelevance": 0.5, "Bias": 0.5, "BLEU": 0.05434912989707719, - "Coherence": 4, + "ContextPrecision": 0.75, + "ContextRecall": 1.0, "ContextRelevance": 0.75, "Faithfulness": 0.3333333333333333, "Hallucination": 0.25, @@ -261,15 +333,16 @@ def test_llm_evaluation_rag_with_mock_client( # Check that the 
returned metrics have the right format. for m in metrics: if m["type"] in [ + "AnswerCorrectness", "AnswerRelevance", "Bias", "BLEU", + "ContextPrecision", + "ContextRecall", "ContextRelevance", "Toxicity", ]: assert 0 <= m["value"] <= 1 - if m["type"] == "Coherence": - assert m["value"] in [1, 2, 3, 4, 5] if m["type"] == "ROUGE": assert isinstance(m["value"], dict) assert all(0 <= v <= 1 for v in m["value"].values()) @@ -337,3 +410,74 @@ def test_llm_evaluation_rag_with_mock_client( }, }, ) + + +def test_llm_evaluation_summarization_with_mock_client( + client: Client, + summarization_gt_questions: list[GroundTruth], + summarization_pred_answers: list[Prediction], + dataset_name: str, + model_name: str, +): + dataset = Dataset.create(dataset_name) + model = Model.create(model_name) + + for gt in summarization_gt_questions: + dataset.add_groundtruth(gt) + + dataset.finalize() + + for pred in summarization_pred_answers: + model.add_prediction(dataset, pred) + + model.finalize_inferences(dataset) + + metrics_to_return = [ + MetricType.SummaryCoherence, + ] + + eval_job = model.evaluate_text_generation( + datasets=dataset, + metrics_to_return=metrics_to_return, + llm_api_params={ + "client": "mock", + "data": { + "model": "model", + }, + }, + metric_params={}, + ) + + assert eval_job.id + eval_job.wait_for_completion(timeout=30) + + assert eval_job.wait_for_completion(timeout=30) == EvaluationStatus.DONE + + metrics = eval_job.metrics + + # Check that the right number of metrics are returned. + assert len(metrics) == len(summarization_pred_answers) * len( + metrics_to_return + ) + + expected_metrics = { + "uid0": { + "SummaryCoherence": 4, + }, + "uid1": { + "SummaryCoherence": 4, + }, + } + + # Check that the returned metrics have the right format. + for m in metrics: + if m["type"] == "SummaryCoherence": + assert m["value"] in [1, 2, 3, 4, 5] + + # Check that mocked metrics are in the returned metrics. + for m in metrics: + uid = m["parameters"]["datum_uid"] + metric_name = m["type"] + assert ( + expected_metrics[uid][metric_name] == m["value"] + ), f"Failed for {uid} and {metric_name}" diff --git a/integration_tests/external/conftest.py b/integration_tests/external/conftest.py index cc220358b..87af0a3f1 100644 --- a/integration_tests/external/conftest.py +++ b/integration_tests/external/conftest.py @@ -3,6 +3,90 @@ from valor import Annotation, Datum, GroundTruth, Prediction +@pytest.fixture +def answer_correctness_q0() -> Datum: + return Datum( + uid="uid0", + text="""Did John Adams get along with Alexander Hamilton?""", + metadata={ + "category": "history", + }, + ) + + +@pytest.fixture +def answer_correctness_q1() -> Datum: + return Datum( + uid="uid1", + text="""Did Lincoln win the election of 1860?""", + metadata={ + "category": "history", + }, + ) + + +@pytest.fixture +def answer_correctness_datums( + answer_correctness_q0: Datum, + answer_correctness_q1: Datum, +) -> list[Datum]: + return [answer_correctness_q0, answer_correctness_q1] + + +@pytest.fixture +def answer_correctness_predictions() -> list[str]: + return [ + """John Adams and Alexander Hamilton did not get along. John Adams and Alexander Hamilton were both federalists.""", + """Lincoln won the election of 1860.""", + ] + + +@pytest.fixture +def answer_correctness_groundtruths() -> list[str]: + return [ + """John Adams and Alexander Hamilton did not get along. 
John Adams and Alexander Hamilton held opposing views on the role of the federal government.""", + """Lincoln won the election of 1860.""", + ] + + +@pytest.fixture +def answer_correctness_gt_questions( + answer_correctness_datums: list[Datum], + answer_correctness_groundtruths: list[str], +) -> list[GroundTruth]: + assert len(answer_correctness_datums) == len( + answer_correctness_groundtruths + ) + return [ + GroundTruth( + datum=answer_correctness_datums[i], + annotations=[Annotation(text=answer_correctness_groundtruths[i])], + ) + for i in range(len(answer_correctness_datums)) + ] + + +@pytest.fixture +def answer_correctness_pred_answers( + answer_correctness_datums: list[Datum], + answer_correctness_predictions: list[str], +) -> list[GroundTruth]: + assert len(answer_correctness_datums) == len( + answer_correctness_predictions + ) + return [ + Prediction( + datum=answer_correctness_datums[i], + annotations=[ + Annotation( + text=answer_correctness_predictions[i], + ) + ], + ) + for i in range(len(answer_correctness_datums)) + ] + + @pytest.fixture def answer_relevance_q0() -> Datum: return Datum( @@ -122,7 +206,7 @@ def bias_datums( @pytest.fixture def bias_predictions() -> list[str]: return [ - """I like living in Canada. Canadians are nicer than Americans. I also like the cold weather.""", + """I like living in Canada. I think Americans are meaner than Canadians. I also like the cold weather.""", """Fox news is a right wing news outlet. Fox News reported that the left wing president is ruining our country.""", ] @@ -160,72 +244,170 @@ def bias_pred_answers( @pytest.fixture -def coherence_q0() -> Datum: +def context_precision_q0() -> Datum: return Datum( uid="uid0", - text="""Write about a haunted house from the perspective of the ghost.""", - metadata={ - "request_type": "creative", - }, + text="""What are some foods that Lewis Hamilton likes?""", ) @pytest.fixture -def coherence_q1() -> Datum: +def context_precision_q1() -> Datum: return Datum( uid="uid1", - text="""Draft an email to a coworker explaining a project delay. Explain that the delay is due to funding cuts, which resulted in multiple employees being moved to different projects. Inform the coworker that the project deadline will have to be pushed back. Be apologetic and professional. Express eagerness to still complete the project as efficiently as possible.""", - metadata={ - "request_type": "professional", - }, + text="""Name the first and third United States presidents.""", ) @pytest.fixture -def coherence_datums( - coherence_q0: Datum, - coherence_q1: Datum, +def context_precision_datums( + context_precision_q0: Datum, + context_precision_q1: Datum, ) -> list[Datum]: - return [coherence_q0, coherence_q1] + return [context_precision_q0, context_precision_q1] @pytest.fixture -def coherence_predictions() -> list[str]: +def context_precision_groundtruths() -> list[str]: return [ - """I am a ghost that is him over there and that was what was what was what was what was what was what was.""", - """Subject: Project Delay Due to Funding Cuts\n\nDear [Coworker's Name],\n\nI hope this message finds you well. I am writing to update you on the status of our project and unfortunately, convey some disappointing news.\n\nDue to recent funding cuts within our department, we have had to make some adjustments to project assignments. As a result, multiple employees, including key team members for our current project, have been moved to different projects to accommodate the changes. 
This unexpected shift has impacted our project timeline.\n\nI regret to inform you that our project deadline will need to be pushed back in light of these developments. I understand the inconvenience this may cause and I sincerely apologize for any disruption this may cause to your schedule or other commitments.\n\nPlease rest assured that despite these unforeseen circumstances, I am fully committed to completing the project efficiently and effectively. I will work closely with the team to develop a revised timeline and ensure that we deliver quality work that meets our objectives.\n\nThank you for your understanding and continued support during this challenging period. I value your collaboration and look forward to working together to overcome this setback and achieve our project goals.\n\nIf you have any questions or concerns, please feel free to reach out to me. I appreciate your patience as we navigate through this situation together.\n\nBest regards,\n\n[Your Name]""", + """Lewis Hamilton likes spicy wings.""", + """The first president of the United States was George Washington. The third president of the United States was Thomas Jefferson.""", ] @pytest.fixture -def coherence_gt_questions( - coherence_datums: list[Datum], +def context_precision_context_list() -> list[list[str]]: + return [ + [ + """Lewis Hamilton is an F1 driver.""", + """Lewis Hamilton likes spicy wings.""", + """The F1 driver with the most wins of all time is Lewis Hamilton.""", + """Taylor Swift likes chicken tenders.""", + ], + [ + """The first president of the United States was George Washington.""", + """The second president of the United States was John Adams.""", + """The third president of the United States was Thomas Jefferson.""", + """The fourth president of the United States was James Madison.""", + ], + ] + + +@pytest.fixture +def context_precision_gt_questions( + context_precision_datums: list[Datum], + context_precision_groundtruths: list[str], ) -> list[GroundTruth]: + assert len(context_precision_datums) == len(context_precision_groundtruths) return [ GroundTruth( - datum=coherence_datums[i], - annotations=[], + datum=context_precision_datums[i], + annotations=[Annotation(text=context_precision_groundtruths[i])], + ) + for i in range(len(context_precision_datums)) + ] + + +@pytest.fixture +def context_precision_pred_answers( + context_precision_datums: list[Datum], + context_precision_context_list: list[list[str]], +) -> list[GroundTruth]: + assert len(context_precision_datums) == len(context_precision_context_list) + return [ + Prediction( + datum=context_precision_datums[i], + annotations=[ + Annotation( + context_list=context_precision_context_list[i], + ) + ], ) - for i in range(len(coherence_datums)) + for i in range(len(context_precision_datums)) + ] + + +@pytest.fixture +def context_recall_q0() -> Datum: + return Datum( + uid="uid0", + ) + + +@pytest.fixture +def context_recall_q1() -> Datum: + return Datum( + uid="uid1", + ) + + +@pytest.fixture +def context_recall_datums( + context_recall_q0: Datum, + context_recall_q1: Datum, +) -> list[Datum]: + return [context_recall_q0, context_recall_q1] + + +@pytest.fixture +def context_recall_groundtruths() -> list[str]: + return [ + """Lewis Hamilton likes spicy wings. Taylor Swift likes chicken tenders.""", + """The first U.S. president was George Washington. The second U.S. president was John Adams. The third U.S. 
president was Thomas Jefferson.""", + ] + + +@pytest.fixture +def context_recall_context_list() -> list[list[str]]: + return [ + [ + """Lewis Hamilton is an F1 driver.""", + """Lewis Hamilton likes spicy wings.""", + ], + [ + """The first president of the United States was George Washington.""", + """The second president of the United States was John Adams.""", + """The third president of the United States was Thomas Jefferson.""", + """The fourth president of the United States was James Madison.""", + ], ] @pytest.fixture -def coherence_pred_answers( - coherence_datums: list[Datum], - coherence_predictions: list[str], +def context_recall_gt_questions( + context_recall_datums: list[Datum], + context_recall_groundtruths: list[str], ) -> list[GroundTruth]: - assert len(coherence_datums) == len(coherence_predictions) + assert len(context_recall_datums) == len(context_recall_groundtruths) + return [ + GroundTruth( + datum=context_recall_datums[i], + annotations=[ + Annotation( + text=context_recall_groundtruths[i], + ) + ], + ) + for i in range(len(context_recall_datums)) + ] + + +@pytest.fixture +def context_recall_pred_answers( + context_recall_datums: list[Datum], + context_recall_context_list: list[list[str]], +) -> list[GroundTruth]: + assert len(context_recall_datums) == len(context_recall_context_list) return [ Prediction( - datum=coherence_datums[i], + datum=context_recall_datums[i], annotations=[ Annotation( - text=coherence_predictions[i], + context_list=context_recall_context_list[i], ) ], ) - for i in range(len(coherence_datums)) + for i in range(len(context_recall_datums)) ] @@ -268,7 +450,7 @@ def context_relevance_context_list() -> list[list[str]]: """Lewis Hamilton is an F1 driver.""", """Lewis Hamilton likes spicy wings.""", """The F1 driver with the most wins of all time is Lewis Hamilton.""", - """Taylor Swift likes chicken tendors.""", + """Taylor Swift likes chicken tenders.""", ], [ """The first president of the United States was George Washington.""", @@ -354,7 +536,7 @@ def faithfulness_context_list() -> list[list[str]]: """Lewis Hamilton is an F1 driver.""", """Lewis Hamilton likes spicy wings.""", """The F1 driver with the most wins of all time is Lewis Hamilton.""", - """Taylor Swift likes chicken tendors.""", + """Taylor Swift likes chicken tenders.""", ], [ """George Washington's favorite color was yellow.""", @@ -488,6 +670,60 @@ def hallucination_pred_answers( ] +@pytest.fixture +def summary_coherence_q0() -> Datum: + return Datum( + uid="uid0", + text="""Everton manager Roberto Martinez has not ruled out the prospect of Antolin Alcaraz or Sylvain Distin earning new contracts but stressed they need to prove they can still be important figures in the club's future. Both centre-backs' current deals expire this summer and it seems highly unlikely Distin, who is 38 in December and has played more for the under-21s in the last month than he has the first team, will be retained. Alcaraz, 33 in July, has more of a chance of securing a short-term extension as Martinez looks to strengthen and restructure his defence in the summer. Roberto Martinez insists 37-year-old defender Sylvain Distin still has time to prove he deserves a new deal . Antolin Alcaraz, who joined Everton from Wigan where he played under Martinez, could get a new deal . While the Toffees boss is keen to advance the talents of younger players - Tyias Browning and Brendan Galloway the two most likely to benefit - he has not ruled out retaining existing senior players. 
'There are only two players out of contract and we have two loan players (Aaron Lennon and Christian Atsu) and those decisions will be made when we have finished the season,' said Martinez. 'The next six games could have a massive bearing on that. Ninety minutes is a big opportunity to change people's views. 'All individuals will be judged over that period. In football it does not matter if you have a contract or not, you always need to improve and show the right attitude and show you are ready to be part of the future of the club. 'But when you get players at the end of their contract there are decisions to be made and it is not just the club, it is the player as well.' Roberto Martinez says his club's recruitment team have been searching for targets for six months . Distin has played more for Everton's youth team than the first XI in the past month, and could be on his way . Martinez said they have established a list of transfer targets for the summer and, while he would not confirm publicly, Aston Villa's on-loan Manchester United midfielder Tom Cleverley, out of contract at the end of the season, is believed to be one of them. 'The recruitment department has been working really hard over the last six months and we need to assemble a really strong squad,' Martinez said. 'First and foremost it is an opportunity for young players to show they are ready for big important roles for next campaign and everyone else providing strong competition to be important figures for the future. Tom Cleverley, who is on loan at Aston Villa, is a target, with Martinez having worked with him before . 'The dressing room is very strong as it is now, so we need to make sure whatever we do in the summer is to get us in a better place. 'We know the situation with Tom. He is a player that I know well having worked with him (in a previous loan spell at Wigan) - and that's it. 'Tom is a player that is at the moment fighting for something very important for his club and that deserves respect. 'I wouldn't expect anyone to speak about my players and I would never do that.'""", + ) + + +@pytest.fixture +def summary_coherence_datums( + summary_coherence_q0: Datum, +) -> list[Datum]: + return [summary_coherence_q0] + + +@pytest.fixture +def summary_coherence_predictions() -> list[str]: + return [ + """Roberto Martinez, Everton's manager, has not ruled out the possibility of offering new contracts to veteran defenders Antolin Alcaraz and Sylvain Distin. However, both players need to prove their value and importance to the team's future. Although Distin seems unlikely to be retained due to his age and recent lack of first-team appearances, Alcaraz may have a better chance of securing a short-term extension. Martinez emphasized the importance of all players showing improvement and commitment, including considering younger talents like Tyias Browning and Brendan Galloway for future roles. 
The club is also planning for the summer transfer window, with reports suggesting they are targeting players like Tom Cleverley, who Martinez has worked with before.""", + ] + + +@pytest.fixture +def summary_coherence_gt_questions( + summary_coherence_datums: list[Datum], +) -> list[GroundTruth]: + return [ + GroundTruth( + datum=summary_coherence_datums[i], + annotations=[], + ) + for i in range(len(summary_coherence_datums)) + ] + + +@pytest.fixture +def summary_coherence_pred_answers( + summary_coherence_datums: list[Datum], + summary_coherence_predictions: list[str], +) -> list[GroundTruth]: + assert len(summary_coherence_datums) == len(summary_coherence_predictions) + return [ + Prediction( + datum=summary_coherence_datums[i], + annotations=[ + Annotation( + text=summary_coherence_predictions[i], + ) + ], + ) + for i in range(len(summary_coherence_datums)) + ] + + @pytest.fixture def toxicity_q0() -> Datum: return Datum( diff --git a/integration_tests/external/test_text_generation_no_mock.py b/integration_tests/external/test_text_generation_no_mock.py index 7d818249f..02f52b30d 100644 --- a/integration_tests/external/test_text_generation_no_mock.py +++ b/integration_tests/external/test_text_generation_no_mock.py @@ -69,6 +69,40 @@ def _get_metrics( return eval_job.metrics +def test_answer_correctness_with_openai( + client: Client, + answer_correctness_gt_questions: list[GroundTruth], + answer_correctness_pred_answers: list[Prediction], + dataset_name: str, + model_name: str, +): + metrics = _get_metrics( + dataset_name=dataset_name, + model_name=model_name, + gt_questions=answer_correctness_gt_questions, + pred_answers=answer_correctness_pred_answers, + metrics_to_return=[MetricType.AnswerCorrectness], + llm_client="openai", + ) + + expected_metrics = { + "uid0": { + "AnswerCorrectness": 0.5, + }, + "uid1": { + "AnswerCorrectness": 1.0, + }, + } + + # Check that the returned metrics have the right format. 
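
The AnswerCorrectness expectations just above (0.5 for uid0, 1.0 for uid1) are consistent with an F1-style score over statement-level TP/FP/FN verdicts; that formula is an assumption inferred from the TP/FP/FN prompt structure, not something this diff spells out, and the helper name below is purely illustrative rather than part of Valor's API. A minimal sketch under that assumption:

```python
def answer_correctness_f1(tp: int, fp: int, fn: int) -> float:
    """F1-style score over statement verdicts (assumed definition, not Valor's API)."""
    if tp == 0:
        return 0.0
    return tp / (tp + 0.5 * (fp + fn))

# uid0: "did not get along" is supported (TP), "were both federalists" has no
# ground truth support (FP), and "held opposing views ..." is missed (FN).
assert answer_correctness_f1(tp=1, fp=1, fn=1) == 0.5

# uid1: prediction and ground truth both state only that Lincoln won in 1860.
assert answer_correctness_f1(tp=1, fp=0, fn=0) == 1.0
```
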
+ for m in metrics: + uid = m["parameters"]["datum_uid"] + metric_name = m["type"] + assert ( + expected_metrics[uid][metric_name] == m["value"] + ), f"Failed for {uid} and {metric_name}" + + def test_answer_relevance_with_openai( client: Client, answer_relevance_gt_questions: list[GroundTruth], @@ -137,28 +171,28 @@ def test_bias_with_openai( ), f"Failed for {uid} and {metric_name}" -def test_coherence_with_openai( +def test_context_relevance_with_openai( client: Client, - coherence_gt_questions: list[GroundTruth], - coherence_pred_answers: list[Prediction], + context_relevance_gt_questions: list[GroundTruth], + context_relevance_pred_answers: list[Prediction], dataset_name: str, model_name: str, ): metrics = _get_metrics( dataset_name=dataset_name, model_name=model_name, - gt_questions=coherence_gt_questions, - pred_answers=coherence_pred_answers, - metrics_to_return=[MetricType.Coherence], + gt_questions=context_relevance_gt_questions, + pred_answers=context_relevance_pred_answers, + metrics_to_return=[MetricType.ContextRelevance], llm_client="openai", ) expected_metrics = { "uid0": { - "Coherence": 1, + "ContextRelevance": 0.25, }, "uid1": { - "Coherence": 5, + "ContextRelevance": 0.75, }, } @@ -171,28 +205,62 @@ def test_coherence_with_openai( ), f"Failed for {uid} and {metric_name}" -def test_context_relevance_with_openai( +def test_context_precision_with_openai( client: Client, - context_relevance_gt_questions: list[GroundTruth], - context_relevance_pred_answers: list[Prediction], + context_precision_gt_questions: list[GroundTruth], + context_precision_pred_answers: list[Prediction], dataset_name: str, model_name: str, ): metrics = _get_metrics( dataset_name=dataset_name, model_name=model_name, - gt_questions=context_relevance_gt_questions, - pred_answers=context_relevance_pred_answers, - metrics_to_return=[MetricType.ContextRelevance], + gt_questions=context_precision_gt_questions, + pred_answers=context_precision_pred_answers, + metrics_to_return=[MetricType.ContextPrecision], llm_client="openai", ) expected_metrics = { "uid0": { - "ContextRelevance": 0.25, + "ContextPrecision": 0.5, }, "uid1": { - "ContextRelevance": 0.75, + "ContextPrecision": 0.8333333333333333, + }, + } + + # Check that the returned metrics have the right format. 
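
The ContextPrecision expectations just above (0.5 for uid0, 0.8333… for uid1, reused by the Mistral test further down) line up with a mean-precision@k reading of the context_precision fixtures: for uid0 only the second of four contexts mentions a food Lewis Hamilton likes, and for uid1 the first and third contexts name the requested presidents. The definition and helper below are assumptions used to illustrate where the numbers come from, not Valor's implementation:

```python
def context_precision(relevant: list[bool]) -> float:
    """Mean precision@k over positions whose context is judged relevant (assumed definition)."""
    precisions: list[float] = []
    hits = 0
    for k, is_relevant in enumerate(relevant, start=1):
        if is_relevant:
            hits += 1
            precisions.append(hits / k)
    return sum(precisions) / len(precisions) if precisions else 0.0

# uid0: only "Lewis Hamilton likes spicy wings." (position 2 of 4) is relevant.
assert context_precision([False, True, False, False]) == 0.5

# uid1: positions 1 and 3 name the first and third presidents -> (1/1 + 2/3) / 2.
assert abs(context_precision([True, False, True, False]) - 0.8333333333333333) < 1e-9
```
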
+ for m in metrics: + uid = m["parameters"]["datum_uid"] + metric_name = m["type"] + assert ( + expected_metrics[uid][metric_name] == m["value"] + ), f"Failed for {uid} and {metric_name}" + + +def test_context_recall_with_openai( + client: Client, + context_recall_gt_questions: list[GroundTruth], + context_recall_pred_answers: list[Prediction], + dataset_name: str, + model_name: str, +): + metrics = _get_metrics( + dataset_name=dataset_name, + model_name=model_name, + gt_questions=context_recall_gt_questions, + pred_answers=context_recall_pred_answers, + metrics_to_return=[MetricType.ContextRecall], + llm_client="openai", + ) + + expected_metrics = { + "uid0": { + "ContextRecall": 0.5, + }, + "uid1": { + "ContextRecall": 1.0, }, } @@ -273,6 +341,31 @@ def test_hallucination_with_openai( ), f"Failed for {uid} and {metric_name}" +def test_summary_coherence_with_openai( + client: Client, + summary_coherence_gt_questions: list[GroundTruth], + summary_coherence_pred_answers: list[Prediction], + dataset_name: str, + model_name: str, +): + metrics = _get_metrics( + dataset_name=dataset_name, + model_name=model_name, + gt_questions=summary_coherence_gt_questions, + pred_answers=summary_coherence_pred_answers, + metrics_to_return=[MetricType.SummaryCoherence], + llm_client="openai", + ) + + # Check that the returned metrics have the right format. + assert len(metrics) == 1 + assert metrics[0]["parameters"]["datum_uid"] == "uid0" + assert metrics[0]["type"] == "SummaryCoherence" + + # Check that the summary coherence was rated >= 3. + assert metrics[0]["value"] in {3, 4, 5} + + def test_toxicity_with_openai( client: Client, toxicity_gt_questions: list[GroundTruth], @@ -310,6 +403,40 @@ def test_toxicity_with_openai( ), f"Failed for {uid} and {metric_name}" +def test_answer_correctness_with_mistral( + client: Client, + answer_correctness_gt_questions: list[GroundTruth], + answer_correctness_pred_answers: list[Prediction], + dataset_name: str, + model_name: str, +): + metrics = _get_metrics( + dataset_name=dataset_name, + model_name=model_name, + gt_questions=answer_correctness_gt_questions, + pred_answers=answer_correctness_pred_answers, + metrics_to_return=[MetricType.AnswerCorrectness], + llm_client="mistral", + ) + + expected_metrics = { + "uid0": { + "AnswerCorrectness": 0.5, + }, + "uid1": { + "AnswerCorrectness": 1.0, + }, + } + + # Check that the returned metrics have the right format. + for m in metrics: + uid = m["parameters"]["datum_uid"] + metric_name = m["type"] + assert ( + expected_metrics[uid][metric_name] == m["value"] + ), f"Failed for {uid} and {metric_name}" + + def test_answer_relevance_with_mistral( client: Client, answer_relevance_gt_questions: list[GroundTruth], @@ -378,28 +505,62 @@ def test_bias_with_mistral( ), f"Failed for {uid} and {metric_name}" -def test_coherence_with_mistral( +def test_context_precision_with_mistral( + client: Client, + context_precision_gt_questions: list[GroundTruth], + context_precision_pred_answers: list[Prediction], + dataset_name: str, + model_name: str, +): + metrics = _get_metrics( + dataset_name=dataset_name, + model_name=model_name, + gt_questions=context_precision_gt_questions, + pred_answers=context_precision_pred_answers, + metrics_to_return=[MetricType.ContextPrecision], + llm_client="mistral", + ) + + expected_metrics = { + "uid0": { + "ContextPrecision": 0.5, + }, + "uid1": { + "ContextPrecision": 0.8333333333333333, + }, + } + + # Check that the returned metrics have the right format. 
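
Similarly, the ContextRecall expectations used by both the OpenAI and Mistral recall tests (0.5 for uid0, 1.0 for uid1) are consistent with an assumed definition of the fraction of ground truth statements attributable to the retrieved contexts: the uid0 ground truth makes two claims but only the spicy-wings one appears in its two contexts, while all three presidential claims for uid1 are covered. A minimal sketch under that assumption, with an illustrative helper name:

```python
def context_recall(attributable: list[bool]) -> float:
    """Fraction of ground truth statements supported by the contexts (assumed definition)."""
    if not attributable:
        return 0.0
    return sum(attributable) / len(attributable)

# uid0: "Lewis Hamilton likes spicy wings." appears in the contexts,
# "Taylor Swift likes chicken tenders." does not.
assert context_recall([True, False]) == 0.5

# uid1: the first, second, and third president statements all appear in the contexts.
assert context_recall([True, True, True]) == 1.0
```
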
+ for m in metrics: + uid = m["parameters"]["datum_uid"] + metric_name = m["type"] + assert ( + expected_metrics[uid][metric_name] == m["value"] + ), f"Failed for {uid} and {metric_name}" + + +def test_context_recall_with_mistral( client: Client, - coherence_gt_questions: list[GroundTruth], - coherence_pred_answers: list[Prediction], + context_recall_gt_questions: list[GroundTruth], + context_recall_pred_answers: list[Prediction], dataset_name: str, model_name: str, ): metrics = _get_metrics( dataset_name=dataset_name, model_name=model_name, - gt_questions=coherence_gt_questions, - pred_answers=coherence_pred_answers, - metrics_to_return=[MetricType.Coherence], + gt_questions=context_recall_gt_questions, + pred_answers=context_recall_pred_answers, + metrics_to_return=[MetricType.ContextRecall], llm_client="mistral", ) expected_metrics = { "uid0": { - "Coherence": 1, + "ContextRecall": 0.5, }, "uid1": { - "Coherence": 5, + "ContextRecall": 1.0, }, } @@ -514,6 +675,31 @@ def test_hallucination_with_mistral( ), f"Failed for {uid} and {metric_name}" +def test_summary_coherence_with_mistral( + client: Client, + summary_coherence_gt_questions: list[GroundTruth], + summary_coherence_pred_answers: list[Prediction], + dataset_name: str, + model_name: str, +): + metrics = _get_metrics( + dataset_name=dataset_name, + model_name=model_name, + gt_questions=summary_coherence_gt_questions, + pred_answers=summary_coherence_pred_answers, + metrics_to_return=[MetricType.SummaryCoherence], + llm_client="mistral", + ) + + # Check that the returned metrics have the right format. + assert len(metrics) == 1 + assert metrics[0]["parameters"]["datum_uid"] == "uid0" + assert metrics[0]["type"] == "SummaryCoherence" + + # Check that the summary coherence was rated >= 3. + assert metrics[0]["value"] in {3, 4, 5} + + def test_toxicity_with_mistral( client: Client, toxicity_gt_questions: list[GroundTruth],