From a910ff3b94517c38cdef85ee0865ff746f9b0a2e Mon Sep 17 00:00:00 2001 From: MilesHolland <108901744+MilesHolland@users.noreply.github.com> Date: Fri, 1 Nov 2024 14:55:34 -0400 Subject: [PATCH] RAI service input sanitization (#38247) * escape eval inputs * new recordings and disable accidentally enabled test * save recordings again * save recordings again again * save recordings again again again * update groundedness threshold * Update sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py Co-authored-by: Billy Hu * correct new check --------- Co-authored-by: Billy Hu --- .../azure-ai-evaluation/assets.json | 2 +- .../ai/evaluation/_common/rai_service.py | 35 +++++++++++++-- .../_service_groundedness.py | 2 +- .../azure-ai-evaluation/tests/conftest.py | 2 +- .../e2etests/data/evaluate_test_data.jsonl | 2 +- .../evaluate_test_data_conversation.jsonl | 2 +- .../tests/e2etests/data/questions.jsonl | 2 +- .../e2etests/data/questions_answers.jsonl | 2 +- .../tests/e2etests/test_builtin_evaluators.py | 45 +++++-------------- .../test_content_safety_rai_script.py | 38 ++++++++++++++++ 10 files changed, 87 insertions(+), 45 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json index 0958d81d3933..356e246839ab 100644 --- a/sdk/evaluation/azure-ai-evaluation/assets.json +++ b/sdk/evaluation/azure-ai-evaluation/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/evaluation/azure-ai-evaluation", - "Tag": "python/evaluation/azure-ai-evaluation_daf1ed16fc" + "Tag": "python/evaluation/azure-ai-evaluation_043418c052" } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py index e359abcca325..432b585e544f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py @@ -6,6 +6,8 @@ import math import re import time +import json +import html from ast import literal_eval from typing import Dict, List, Optional, Union, cast from urllib.parse import urlparse @@ -38,10 +40,38 @@ USER_TEXT_TEMPLATE_DICT: Dict[str, Template] = { "DEFAULT": Template("{$query}{$response}"), - Tasks.GROUNDEDNESS: Template('{"question": "$query", "answer": "$response", "context": "$context"}'), } +def get_formatted_template(data: dict, annotation_task: str) -> str: + """Given the task and input data, produce a formatted string that will serve as the main + payload for the RAI service. Requires specific per-task logic. + + :param data: The data to incorporate into the payload. + :type data: dict + :param annotation_task: The annotation task to use. This determines the template to use. + :type annotation_task: str + :return: The formatted based on the data and task template. + :rtype: str + """ + # Template class doesn't play nice with json dumping/loading, just handle groundedness' + # JSON format manually. + # Template was: Template('{"question": "$query", "answer": "$response", "context": "$context"}'), + if annotation_task == Tasks.GROUNDEDNESS: + as_dict = { + "question": data.get("query", ""), + "answer": data.get("response", ""), + "context": data.get("context", ""), + } + return json.dumps(as_dict) + as_dict = { + "query": html.escape(data.get("query", "")), + "response": html.escape(data.get("response", "")), + } + user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**as_dict) + return user_text.replace("'", '\\"') + + def get_common_headers(token: str) -> Dict: """Get common headers for the HTTP request @@ -161,8 +191,7 @@ async def submit_request(data: dict, metric: str, rai_svc_url: str, token: str, :return: The operation ID. :rtype: str """ - user_text = USER_TEXT_TEMPLATE_DICT.get(annotation_task, USER_TEXT_TEMPLATE_DICT["DEFAULT"]).substitute(**data) - normalized_user_text = user_text.replace("'", '\\"') + normalized_user_text = get_formatted_template(data, annotation_task) payload = generate_payload(normalized_user_text, metric, annotation_task=annotation_task) url = rai_svc_url + "/submitannotation" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py index be0d249c99b3..4175823e9598 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_service_groundedness/_service_groundedness.py @@ -92,7 +92,7 @@ def __init__( azure_ai_project, **kwargs, ): - self._passing_score = 3 # TODO update once the binarization PR is merged + self._passing_score = 5 # TODO update once the binarization PR is merged self._output_prefix = "groundedness_pro" super().__init__( eval_metric=EvaluationMetrics.GROUNDEDNESS, diff --git a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py index 5a44f2f2abb0..4c27d1cf0c90 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py @@ -203,7 +203,7 @@ def simple_conversation(): return { "messages": [ { - "content": "What is the capital of France?", + "content": "What is the capital of France?`''\"{}{{]", "role": "user", "context": "Customer wants to know the capital of France", }, diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data.jsonl index 0396a22c2db6..cad00edbdff1 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data.jsonl +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data.jsonl @@ -1,3 +1,3 @@ {"query":"How do you create a run?","context":"AML API only","response":"To create a run using the Azure Machine Learning API, you first need to create an Experiment. Once you have an experiment, you can create a Run object that is associated with that experiment. Here is some Python code that demonstrates this process:\n\n```\nfrom azureml.core import Experiment, Run\nfrom azureml.core.workspace import Workspace\n\n# Define workspace and experiment\nws = Workspace.from_config()\nexp = Experiment(workspace=ws, name='my_experiment')\n\n# Create a new run\nrun = exp.start_logging()\n```\n\nIn this code, the `from_config()` method reads the configuration file that you created when you set up your Azure Machine Learning workspace. The `Experiment` constructor creates an Experiment object that is associated with your workspace, and the `start_logging()` method creates a new Run object that is associated with the Experiment. Now you can use the `run` object to log metrics, upload files, and track other information related to your machine learning experiment.", "ground_truth":"Paris is the capital of France."} {"query":"How do you log a model?","context":"Logging can be done using any OSS Sdk","response":"There are a few ways to log models in Azure Machine Learning. \n\nOne way is to use the `register_model()` method of the `Run` object. The `register_model()` method logs a model file in the Azure Machine Learning service workspace and makes it available for deployment. Here's an example:\n\n```python\nfrom azureml.core import Model\n\nmodel_path = '.\/outputs\/my_model.pkl'\nmodel = Model.register(workspace=ws, model_path=model_path, model_name='my_model')\n```\n\nThis code registers the model file located at `model_path` to the Azure Machine Learning service workspace with the name `my_model`. \n\nAnother way to log a model is to save it as an output of a `Run`. If your model generation code is part of a script or Jupyter notebook that runs as an Azure Machine Learning experiment, you can save the model file as an output of the `Run` object. Here's an example:\n\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom azureml.core.run import Run\n\n# Initialize a run object\nrun = Run.get_context()\n\n# Train your model\nX_train, y_train = ...\nclf = LogisticRegression().fit(X_train, y_train)\n\n# Save the model to the Run object's outputs directory\nmodel_path = 'outputs\/model.pkl'\njoblib.dump(value=clf, filename=model_path)\n\n# Log the model as a run artifact\nrun.upload_file(name=model_path, path_or_stream=model_path)\n```\n\nIn this code, `Run.get_context()` retrieves the current run context object, which you can use to track metadata and metrics for the run. After training your model, you can use `joblib.dump()` to save the model to a file, and then log the file as an artifact of the run using `run.upload_file()`.","ground_truth":"Paris is the capital of France."} -{"query":"What is the capital of France?","context":"France is in Europe","response":"Paris is the capital of France.", "ground_truth":"Paris is the capital of France."} +{"query":"What is the capital of France?`''\"{}{{]","context":"France is in Europe","response":"Paris is the capital of France.", "ground_truth":"Paris is the capital of France."} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_conversation.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_conversation.jsonl index 7a52cebbf2f3..037487fae1e4 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_conversation.jsonl +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/evaluate_test_data_conversation.jsonl @@ -1,2 +1,2 @@ {"conversation" : {"context" : "", "messages": [{"content": "What shape has 3 sides", "role" :"user", "context": null}, {"content": "A triangle", "role" :"assistant", "context": "The answer is a triangle."}, {"content": "Next, what shape has 4 sides", "role" :"user", "context": null}, {"content": "A square", "role" :"assistant", "context": "The answer is a square."}]}} -{"conversation" : {"context" : "User wants to know about state capitals", "messages": [{"content": "What is the capital of Hawaii", "role" :"user", "context": "User wants to know the capital of Hawaii"}, {"content": "Honolulu", "role" :"assistant", "context": "The answer is a Honolulu."}, {"content": "Ok, what is the capital of Massachusetts", "role" :"user", "context": "User wants to know the capital of Massachusetts."}, {"content": "Boston", "role" :"assistant", "context": "The answer is Boston."}]}} +{"conversation" : {"context" : "User wants to know about state capitals", "messages": [{"content": "What is the capital of Hawaii`''\"{}{{]", "role" :"user", "context": "User wants to know the capital of Hawaii"}, {"content": "Honolulu", "role" :"assistant", "context": "The answer is a Honolulu."}, {"content": "Ok, what is the capital of Massachusetts", "role" :"user", "context": "User wants to know the capital of Massachusetts."}, {"content": "Boston", "role" :"assistant", "context": "The answer is Boston."}]}} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/questions.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/questions.jsonl index 7507f5ca50a3..4419b71e10b4 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/questions.jsonl +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/questions.jsonl @@ -1,3 +1,3 @@ {"query":"How long is flight from Earth to LV-426?","ground_truth":"Far away.","context": "Refers to a distant fictional location."} {"query":"Why there is no central heating on the street?","ground_truth":"It is expensive.","context": "Discusses infrastructure cost."} -{"query":"Why these questions are so strange?","ground_truth":"The life is strange...","context": "Questions may seem unusual."} +{"query":"Why these questions are so strange?`''\"{}{{]","ground_truth":"The life is strange...","context": "Questions may seem unusual."} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/questions_answers.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/questions_answers.jsonl index d7d3b3510dd6..764c3bdb08ce 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/questions_answers.jsonl +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/questions_answers.jsonl @@ -1 +1 @@ -{"query":"How long is flight from Earth to LV-426?","response":"There is nothing good there.", "ground_truth": "39 light years"} +{"query":"How long is flight from Earth to LV-426?`''\"{}{{]","response":"There is nothing good there.", "ground_truth": "39 light years"} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py index 7667b6d25a86..5bd942be4000 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py @@ -14,6 +14,7 @@ ImageContentItem, ImageUrl, ) +from azure.ai.evaluation._common.constants import HarmSeverityLevel from azure.ai.evaluation._model_configurations import Conversation from azure.ai.evaluation import ( BleuScoreEvaluator, @@ -240,7 +241,7 @@ def test_content_safety_evaluator_violence(self, project_scope, azure_cred, simp assert score2["violence_score"] > 0 assert score2["evaluation_per_turn"]["violence_score"][0] == 0 assert score2["evaluation_per_turn"]["violence_score"][1] > 0 - assert score2["evaluation_per_turn"]["violence"] == ["Very low", "High"] + assert len(score2["evaluation_per_turn"]["violence"]) == 2 assert all(score2["evaluation_per_turn"]["violence_reason"]), "violence_reason must not be None or empty." def test_content_safety_evaluator_sexual(self, project_scope, azure_cred, simple_conversation): @@ -287,7 +288,7 @@ def test_content_safety_evaluator_self_harm(self, project_scope, azure_cred, sim # Test eval_last_turn behavior eval_fn_last = SelfHarmEvaluator(azure_cred, project_scope, eval_last_turn=True) score3 = eval_fn_last(conversation=simple_conversation) - assert score3["self_harm"] == "Very low" + assert score3["self_harm"] in (level.value for level in HarmSeverityLevel) assert score3["self_harm_score"] >= 1 assert score3["self_harm_reason"], "self_harm_reason must not be None or empty." @@ -456,38 +457,12 @@ def test_protected_material_evaluator(self, project_scope, azure_cred, simple_co assert not good_result["protected_material_label"] assert "material was not found" in good_result["protected_material_reason"] - problematic_text = ( - "Hey Dude, don't make it bad Take a sad song and make it better " - + "Remember to let her into your heart Then you can stop to make it better Hey Dude, " - + "don't be afraid You were made to go out and get her The minute you let her " - + "under your skin Then you begin to make it better And anytime you feel the pain, " - + "hey Jude, refrain Don't carry the world upon your shoulders For well you know that " - + "it's a mule who plays it cool By making his world a little folder Na, na, na, " - + "naa-naa Na, na, naa-naa Hey Dude, don't let me down You have found her, " - + "now go and get her (let it out and let it in) Remember to let her into your heart" - ) - - problem_answer = ip_eval( - query="-", - response=problematic_text, - ) - - problem_question = ip_eval( - response="-", - query=problematic_text, - ) - assert problem_answer is not None - assert problem_answer["protected_material_label"] - assert "material was found" in problem_answer["protected_material_reason"] - assert problem_question is not None - assert problem_question["protected_material_label"] - assert "material was found" in problem_question["protected_material_reason"] + # Positive test casing removed due to how fluctuating service results. # Test conversation input - simple_conversation["messages"][3]["content"] = problematic_text convo_result = ip_eval(conversation=simple_conversation) - assert convo_result["protected_material_label"] == 0.5 - assert convo_result["evaluation_per_turn"]["protected_material_label"] == [False, True] + assert convo_result["protected_material_label"] == 0.0 + assert convo_result["evaluation_per_turn"]["protected_material_label"] == [False, False] assert all( convo_result["evaluation_per_turn"]["protected_material_reason"] ), "protected_material_reason must not be None or empty." @@ -761,16 +736,16 @@ def test_multimodal_evaluator_content_safety_json_b64_image(self, project_scope, score = evaluator(conversation=conversation) assert score is not None - assert score["violence"] == "Very low" + assert score["violence"] in (level.value for level in HarmSeverityLevel) assert score["violence_score"] < 1.0 assert score["violence_reason"], "violence_reason must not be None or empty." - assert score["sexual"] == "Very low" + assert score["sexual"] in (level.value for level in HarmSeverityLevel) assert score["sexual_score"] < 1.0 assert score["sexual_reason"], "sexual_reason must not be None or empty." - assert score["self_harm"] == "Very low" + assert score["self_harm"] in (level.value for level in HarmSeverityLevel) assert score["self_harm_score"] < 1.0 assert score["self_harm_reason"], "self_harm_reason must not be None or empty." - assert score["hate_unfairness"] == "Very low" + assert score["hate_unfairness"] in (level.value for level in HarmSeverityLevel) assert score["hate_unfairness_score"] < 1.0 assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty." diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py index 6a40585972e4..6fd7b6d6283a 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py @@ -2,6 +2,7 @@ import math import os import pathlib +import json, html, re from typing import Any, Iterator, MutableMapping, Optional from unittest.mock import MagicMock, patch @@ -18,6 +19,8 @@ parse_response, submit_request, Tasks, + USER_TEXT_TEMPLATE_DICT, + get_formatted_template, ) from azure.core.exceptions import HttpResponseError from azure.core.rest import AsyncHttpResponse, HttpRequest @@ -431,3 +434,38 @@ async def test_evaluate_with_rai_service( assert submit_mock._mock_call_count == 1 assert fetch_result_mock._mock_call_count == 1 assert parse_mock._mock_call_count == 1 + + # RAI service templates are so different that it's not worth trying to test them all in one test. + # Groundedness is JSON + def test_get_formatted_template_groundedness(self): + tagged_text = "This text has <> tags." + bracketed_text = "{This text has {brackets}, and I didn't even both to even them out {." + quoted_text = ( + 'This text has \'quotes\', also it has "quotes", and it even has `backticks` and """ triple quotes""".' + ) + all_texts = [tagged_text, quoted_text, bracketed_text] + for text in all_texts: + input_kwargs = { + "query": text, + "response": text, + "context": text, + } + formatted_payload = get_formatted_template(input_kwargs, Tasks.GROUNDEDNESS) + assert json.loads(formatted_payload)["question"] == text + + # Default is basic markup. + def test_get_formatted_template_default(self): + tagged_text = "This text has <> tags." + bracketed_text = "{This text has {brackets}, and I didn't even both to even them out {." + quoted_text = ( + 'This text has \'quotes\', also it has "quotes", and it even has `backticks` and """ triple quotes""".' + ) + all_texts = [tagged_text, quoted_text, bracketed_text] + for text in all_texts: + input_kwargs = { + "query": text, + "response": text, + "context": text, + } + formatted_payload = get_formatted_template(input_kwargs, "DEFAULT") + assert html.unescape(re.match("\{(.*?)}\<", formatted_payload)[1]) == text