diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 33fbfa2096fc..262d58302aa8 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -1,6 +1,5 @@ # Release History - ## 1.0.0b5 (Unreleased) ### Features Added @@ -23,6 +22,7 @@ outputs = asyncio.run(custom_simulator( max_conversation_turns=1, )) ``` +- Adding evaluator for multimodal use cases ### Breaking Changes - Renamed environment variable `PF_EVALS_BATCH_USE_ASYNC` to `AI_EVALS_BATCH_USE_ASYNC`. diff --git a/sdk/evaluation/azure-ai-evaluation/assets.json b/sdk/evaluation/azure-ai-evaluation/assets.json index 7144de427f88..8483a02c668b 100644 --- a/sdk/evaluation/azure-ai-evaluation/assets.json +++ b/sdk/evaluation/azure-ai-evaluation/assets.json @@ -2,5 +2,5 @@ "AssetsRepo": "Azure/azure-sdk-assets", "AssetsRepoPrefixPath": "python", "TagPrefix": "python/evaluation/azure-ai-evaluation", - "Tag": "python/evaluation/azure-ai-evaluation_f0444ef220" + "Tag": "python/evaluation/azure-ai-evaluation_eb4989f81d" } diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py index f1d59bf13b24..c21a97a9531a 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py @@ -12,6 +12,14 @@ SexualEvaluator, ViolenceEvaluator, ) +from ._evaluators._multimodal._content_safety_multimodal import ( + ContentSafetyMultimodalEvaluator, + HateUnfairnessMultimodalEvaluator, + SelfHarmMultimodalEvaluator, + SexualMultimodalEvaluator, + ViolenceMultimodalEvaluator, +) +from ._evaluators._multimodal._protected_material import ProtectedMaterialMultimodalEvaluator from ._evaluators._f1_score import F1ScoreEvaluator from ._evaluators._fluency import FluencyEvaluator from ._evaluators._gleu import GleuScoreEvaluator @@ -65,4 +73,10 @@ 
"Conversation", "Message", "EvaluationResult", + "ContentSafetyMultimodalEvaluator", + "HateUnfairnessMultimodalEvaluator", + "SelfHarmMultimodalEvaluator", + "SexualMultimodalEvaluator", + "ViolenceMultimodalEvaluator", + "ProtectedMaterialMultimodalEvaluator", ] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py index cfbed826a302..3d8ad943522d 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/rai_service.py @@ -13,10 +13,12 @@ import jwt +from promptflow.core._errors import MissingRequiredPackage from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation._http_utils import AsyncHttpPipeline, get_async_http_client from azure.ai.evaluation._model_configurations import AzureAIProject from azure.core.credentials import TokenCredential +from azure.core.exceptions import HttpResponseError from azure.core.pipeline.policies import AsyncRetryPolicy from .constants import ( @@ -26,7 +28,7 @@ Tasks, _InternalEvaluationMetrics, ) -from .utils import get_harm_severity_level +from .utils import get_harm_severity_level, retrieve_content_type try: version = importlib.metadata.version("azure-ai-evaluation") @@ -281,7 +283,11 @@ def _parse_content_harm_response( if key == EvaluationMetrics.HATE_FAIRNESS: key = EvaluationMetrics.HATE_UNFAIRNESS - result: Dict[str, Union[str, float]] = {key: math.nan, key + "_score": math.nan, key + "_reason": ""} + result: Dict[str, Union[str, float]] = { + (key.value if hasattr(key, "value") else key): math.nan, + f"{(key.value if hasattr(key, 'value') else key)}_score": math.nan, + f"{(key.value if hasattr(key, 'value') else key)}_reason": math.nan, + } response = batch_response[0] if metric_name not in response: @@ -330,6 +336,8 @@ def 
_parse_content_harm_response( reason = "" harm_score = metric_value + if metric_value == "n/a": + return result if not math.isnan(metric_value): # int(math.nan) causes a value error, and math.nan is already handled # by get_harm_severity_level @@ -465,3 +473,109 @@ async def evaluate_with_rai_service( result = parse_response(annotation_response, metric_name, metric_display_name) return result + + +def generate_payload_multimodal(content_type: str, messages, metric: str) -> Dict: + """Generate the payload for the annotation request + :param content_type: The type of the content representing multimodal or images. + :type content_type: str + :param messages: The normalized list of messages to be entered as the "Contents" in the payload. + :type messages: str + :param metric: The evaluation metric to use. This determines the task type, and whether a "MetricList" is needed + in the payload. + :type metric: str + :return: The payload for the annotation request. + :rtype: Dict + """ + include_metric = True + task = Tasks.CONTENT_HARM + if metric == EvaluationMetrics.PROTECTED_MATERIAL: + task = Tasks.PROTECTED_MATERIAL + include_metric = False + + if include_metric: + return { + "ContentType": content_type, + "Contents": [{"messages": messages}], + "AnnotationTask": task, + "MetricList": [metric], + } + return { + "ContentType": content_type, + "Contents": [{"messages": messages}], + "AnnotationTask": task, + } + + +async def submit_multimodal_request(messages, metric: str, rai_svc_url: str, token: str) -> str: + """Submit request to Responsible AI service for evaluation and return operation ID + :param messages: The normalized list of messages to be entered as the "Contents" in the payload. + :type messages: str + :param metric: The evaluation metric to use. + :type metric: str + :param rai_svc_url: The Responsible AI service URL. + :type rai_svc_url: str + :param token: The Azure authentication token. + :type token: str + :return: The operation ID. 
+ :rtype: str + """ + ## handle json payload and payload from inference sdk strongly type messages + if len(messages) > 0 and not isinstance(messages[0], dict): + try: + from azure.ai.inference.models import ChatRequestMessage + except ImportError as ex: + error_message = ( + "Please install 'azure-ai-inference' package to use SystemMessage, UserMessage, AssistantMessage" + ) + raise MissingRequiredPackage(message=error_message) from ex + if len(messages) > 0 and isinstance(messages[0], ChatRequestMessage): + messages = [message.as_dict() for message in messages] + + filtered_messages = [message for message in messages if message["role"] != "system"] + assistant_messages = [message for message in messages if message["role"] == "assistant"] + content_type = retrieve_content_type(assistant_messages, metric) + payload = generate_payload_multimodal(content_type, filtered_messages, metric) + + ## calling rai service for annotation + url = rai_svc_url + "/submitannotation" + headers = get_common_headers(token) + async with get_async_http_client() as client: + response = await client.post( # pylint: disable=too-many-function-args,unexpected-keyword-arg + url, json=payload, headers=headers + ) + if response.status_code != 202: + raise HttpResponseError( + message=f"Received unexpected HTTP status: {response.status_code} {response.text()}", response=response + ) + result = response.json() + operation_id = result["location"].split("/")[-1] + return operation_id + + +async def evaluate_with_rai_service_multimodal( + messages, metric_name: str, project_scope: AzureAIProject, credential: TokenCredential +): + """Evaluate the content safety of the response using Responsible AI service + :param messages: The normalized list of messages. + :type messages: list + :param metric_name: The evaluation metric to use. + :type metric_name: str + :param project_scope: The Azure AI project scope details. + :type project_scope: Dict + :param credential: The Azure authentication credential. 
+ :type credential: + ~azure.core.credentials.TokenCredential + :return: The parsed annotation result. + :rtype: List[List[Dict]] + """ + + # Get RAI service URL from discovery service and check service availability + token = await fetch_or_reuse_token(credential) + rai_svc_url = await get_rai_svc_url(project_scope, token) + await ensure_service_availability(rai_svc_url, token, Tasks.CONTENT_HARM) + # Submit annotation request and fetch result + operation_id = await submit_multimodal_request(messages, metric_name, rai_svc_url, token) + annotation_response = cast(List[Dict], await fetch_result(operation_id, rai_svc_url, credential, token)) + result = parse_response(annotation_response, metric_name) + return result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py index 9d22d522d230..32a83144db61 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_common/utils.py @@ -9,9 +9,9 @@ import nltk from typing_extensions import NotRequired, Required, TypeGuard - +from promptflow.core._errors import MissingRequiredPackage from azure.ai.evaluation._constants import AZURE_OPENAI_TYPE, OPENAI_TYPE -from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, EvaluationException +from azure.ai.evaluation._exceptions import ErrorBlame, ErrorCategory, ErrorTarget, EvaluationException from azure.ai.evaluation._model_configurations import ( AzureAIProject, AzureOpenAIModelConfiguration, @@ -312,3 +312,100 @@ def remove_optional_singletons(eval_class, singletons): if param in singletons: del required_singletons[param] return required_singletons + + +def retrieve_content_type(assistant_messages: List, metric: str) -> str: + """Get the content type for service payload. 
+ + :param assistant_messages: The list of messages to be annotated by evaluation service + :type assistant_messages: list + :param metric: A string representing the metric type + :type metric: str + :return: A text representing the content type. Example: 'text', or 'image' + :rtype: str + """ + # Check if metric is "protected_material" + if metric == "protected_material": + return "image" + + # Iterate through each message + for item in assistant_messages: + # Ensure "content" exists in the message and is iterable + content = item.get("content", []) + for message in content: + if message.get("type", "") == "image_url": + return "image" + # Default return if no image was found + return "text" + + +def validate_conversation(conversation): + def raise_exception(msg, target): + raise EvaluationException( + message=msg, + internal_message=msg, + target=target, + category=ErrorCategory.INVALID_VALUE, + blame=ErrorBlame.USER_ERROR, + ) + + if not conversation or "messages" not in conversation: + raise_exception( + "Attribute 'messages' is missing in the request", + ErrorTarget.CONTENT_SAFETY_CHAT_EVALUATOR, + ) + messages = conversation["messages"] + if not isinstance(messages, list): + raise_exception( + "'messages' parameter must be a JSON-compatible list of chat messages", + ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR, + ) + expected_roles = {"user", "assistant", "system"} + image_found = False + for num, message in enumerate(messages, 1): + if not isinstance(message, dict): + try: + from azure.ai.inference.models import ( + ChatRequestMessage, + UserMessage, + AssistantMessage, + SystemMessage, + ImageContentItem, + ) + except ImportError as ex: + raise MissingRequiredPackage( + message="Please install 'azure-ai-inference' package to use SystemMessage, AssistantMessage" + ) from ex + + if isinstance(messages[0], ChatRequestMessage) and not isinstance( + message, (UserMessage, AssistantMessage, SystemMessage) + ): + raise_exception( + f"Messages must be a strongly 
typed class of ChatRequestMessage. Message number: {num}", + ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR, + ) + + if isinstance(message.content, list) and any( + isinstance(item, ImageContentItem) for item in message.content + ): + image_found = True + continue + if message.get("role") not in expected_roles: + raise_exception( + f"Invalid role provided: {message.get('role')}. Message number: {num}", + ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR, + ) + content = message.get("content") + if not isinstance(content, (str, list)): + raise_exception( + f"Content in each turn must be a string or array. Message number: {num}", + ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR, + ) + if isinstance(content, list): + if any(item.get("type") == "image_url" and "url" in item.get("image_url", {}) for item in content): + image_found = True + if not image_found: + raise_exception( + "Message needs to have multi-modal input like images.", + ErrorTarget.CONTENT_SAFETY_MULTIMODAL_EVALUATOR, + ) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py index aee603b82f72..3249323c4905 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_utils.py @@ -8,6 +8,8 @@ import tempfile from pathlib import Path from typing import Any, Dict, NamedTuple, Optional, Tuple, Union +import uuid +import base64 import pandas as pd from promptflow.client import PFClient @@ -81,6 +83,33 @@ def _azure_pf_client_and_triad(trace_destination) -> Tuple[PFClient, AzureMLWork return azure_pf_client, ws_triad +def _store_multimodal_content(messages, tmpdir: str): + # verify if images folder exists + images_folder_path = os.path.join(tmpdir, "images") + os.makedirs(images_folder_path, exist_ok=True) + + # traverse all messages and replace base64 image data with new file name. 
+ for message in messages: + for content in message.get("content", []): + if content.get("type") == "image_url": + image_url = content.get("image_url") + if image_url and "url" in image_url and image_url["url"].startswith("data:image/jpg;base64,"): + # Extract the base64 string + base64image = image_url["url"].replace("data:image/jpg;base64,", "") + + # Generate a unique filename + image_file_name = f"{str(uuid.uuid4())}.jpg" + image_url["url"] = f"images/{image_file_name}" # Replace the base64 URL with the file path + + # Decode the base64 string to binary image data + image_data_binary = base64.b64decode(base64image) + + # Write the binary image data to the file + image_file_path = os.path.join(images_folder_path, image_file_name) + with open(image_file_path, "wb") as f: + f.write(image_data_binary) + + def _log_metrics_and_instance_results( metrics: Dict[str, Any], instance_results: pd.DataFrame, @@ -110,6 +139,15 @@ def _log_metrics_and_instance_results( artifact_name = EvalRun.EVALUATION_ARTIFACT if run else EvalRun.EVALUATION_ARTIFACT_DUMMY_RUN with tempfile.TemporaryDirectory() as tmpdir: + # storing multi_modal images if exists + col_name = "inputs.conversation" + if col_name in instance_results.columns: + for item in instance_results[col_name].items(): + value = item[1] + if "messages" in value: + _store_multimodal_content(value["messages"], tmpdir) + + # storing artifact result tmp_path = os.path.join(tmpdir, artifact_name) with open(tmp_path, "w", encoding=DefaultOpenEncoding.WRITE) as f: diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py index 2781c88d96eb..d0dc69820607 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py +++ 
b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_content_safety/_content_safety_chat.py @@ -99,10 +99,10 @@ def __init__( self._eval_last_turn = eval_last_turn self._parallel = parallel self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [ - ViolenceEvaluator(azure_ai_project, credential), - SexualEvaluator(azure_ai_project, credential), - SelfHarmEvaluator(azure_ai_project, credential), - HateUnfairnessEvaluator(azure_ai_project, credential), + ViolenceEvaluator(credential, azure_ai_project), + SexualEvaluator(credential, azure_ai_project), + SelfHarmEvaluator(credential, azure_ai_project), + HateUnfairnessEvaluator(credential, azure_ai_project), ] def __call__(self, *, conversation: list, **kwargs): diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/__init__.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/__init__.py new file mode 100644 index 000000000000..861e8d1ea088 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/__init__.py @@ -0,0 +1,20 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# --------------------------------------------------------- +from ._content_safety_multimodal import ContentSafetyMultimodalEvaluator +from ._content_safety_multimodal_base import ContentSafetyMultimodalEvaluatorBase +from ._hate_unfairness import HateUnfairnessMultimodalEvaluator +from ._self_harm import SelfHarmMultimodalEvaluator +from ._sexual import SexualMultimodalEvaluator +from ._violence import ViolenceMultimodalEvaluator +from ._protected_material import ProtectedMaterialMultimodalEvaluator + +__all__ = [ + "ContentSafetyMultimodalEvaluator", + "ContentSafetyMultimodalEvaluatorBase", + "ViolenceMultimodalEvaluator", + "SexualMultimodalEvaluator", + "SelfHarmMultimodalEvaluator", + "HateUnfairnessMultimodalEvaluator", + "ProtectedMaterialMultimodalEvaluator", +] diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py new file mode 100644 index 000000000000..53518ee02518 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal.py @@ -0,0 +1,130 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# --------------------------------------------------------- +import logging +from concurrent.futures import as_completed +from typing import Callable, Dict, List, Union + +from promptflow.tracing import ThreadPoolExecutorWithContext as ThreadPoolExecutor +from azure.ai.evaluation._common._experimental import experimental +from azure.ai.evaluation._common.utils import validate_conversation +from ._hate_unfairness import HateUnfairnessMultimodalEvaluator +from ._self_harm import SelfHarmMultimodalEvaluator +from ._sexual import SexualMultimodalEvaluator +from ._violence import ViolenceMultimodalEvaluator + + +logger = logging.getLogger(__name__) + + +@experimental +class ContentSafetyMultimodalEvaluator: + """ + Initialize a content safety multimodal evaluator configured to evaluate content safety metrics + in a multimodal scenario. + + :param credential: The credential required for connecting to the Azure AI project. + :type credential: ~azure.core.credentials.TokenCredential + :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID, + resource group, and project name. + :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject + :param parallel: Specifies whether to use parallel execution for evaluators. + If True, evaluators execute in parallel; otherwise, they execute sequentially. Defaults to False. + :type parallel: bool + + :return: A function that evaluates multimodal chat messages and generates content safety metrics. + :rtype: Callable + + **Usage Example** + + .. code-block:: python + + azure_ai_project = { + "subscription_id": "", + "resource_group_name": "", + "project_name": "", + } + eval_fn = ContentSafetyMultimodalEvaluator(credential, azure_ai_project) + result = eval_fn( + conversation={ + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": "" + } + } + ] + }, + { + "role": "assistant", + "content": "This picture shows an astronaut standing in the desert." + } + ] + } + ) + + **Output Format** + + .. code-block:: json + + { + "violence": "Medium", + "violence_score": 5.0, + "violence_reason": "Some reason", + "sexual": "Medium", + "sexual_score": 5.0, + "sexual_reason": "Some reason", + "self_harm": "Medium", + "self_harm_score": 5.0, + "self_harm_reason": "Some reason", + "hate_unfairness": "Medium", + "hate_unfairness_score": 5.0, + "hate_unfairness_reason": "Some reason" + } + + """ + + def __init__(self, credential, azure_ai_project, parallel: bool = False): + self._parallel = parallel + self._evaluators: List[Callable[..., Dict[str, Union[str, float]]]] = [ + ViolenceMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project), + SexualMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project), + SelfHarmMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project), + HateUnfairnessMultimodalEvaluator(credential=credential, azure_ai_project=azure_ai_project), + ] + + def __call__(self, *, conversation, **kwargs): + """ + Evaluates content-safety metrics for list of messages. + :keyword conversation: The conversation contains list of messages to be evaluated. + Each message should have "role" and "content" keys. + :paramtype conversation: ~azure.ai.evaluation.Conversation + :return: The evaluation score based on the Content Safety Metrics. 
+ :rtype: Dict[str, Union[float, str]] + """ + # validate inputs + validate_conversation(conversation) + results: Dict[str, Union[str, float]] = {} + if self._parallel: + with ThreadPoolExecutor() as executor: + futures = { + executor.submit(evaluator, conversation=conversation, **kwargs): evaluator + for evaluator in self._evaluators + } + + for future in as_completed(futures): + results.update(future.result()) + else: + for evaluator in self._evaluators: + result = evaluator(conversation=conversation, **kwargs) + results.update(result) + + return results diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py new file mode 100644 index 000000000000..205ce002751c --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_content_safety_multimodal_base.py @@ -0,0 +1,57 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +from abc import ABC +from typing import Union +from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_multimodal +from azure.ai.evaluation._common.constants import EvaluationMetrics, _InternalEvaluationMetrics +from azure.ai.evaluation._common.utils import validate_conversation +from azure.core.credentials import TokenCredential +from azure.ai.evaluation._common._experimental import experimental + + +@experimental +class ContentSafetyMultimodalEvaluatorBase(ABC): + """ + Initialize an evaluator for a specified Evaluation Metric. Base class that is not + meant to be instantiated by users. + + :param metric: The metric to be evaluated. 
+ :type metric: ~azure.ai.evaluation._evaluators._content_safety.flow.constants.EvaluationMetrics + :param credential: The credential for connecting to Azure AI project. Required + :type credential: ~azure.core.credentials.TokenCredential + :param azure_ai_project: The scope of the Azure AI project. + It contains subscription id, resource group, and project name. + :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject + """ + + def __init__( + self, + metric: Union[EvaluationMetrics, _InternalEvaluationMetrics], + credential: TokenCredential, + azure_ai_project, + ): + self._metric = metric + self._azure_ai_project = azure_ai_project + self._credential = credential + + async def __call__(self, *, conversation, **kwargs): + """ + Evaluates content according to this evaluator's metric. + :keyword conversation: The conversation contains list of messages to be evaluated. + Each message should have "role" and "content" keys. + :paramtype conversation: ~azure.ai.evaluation.Conversation + :return: The evaluation score computation based on the Content Safety metric (self.metric). + :rtype: Dict[str, Union[float, str]] + """ + # validate inputs + validate_conversation(conversation) + messages = conversation["messages"] + # Run score computation based on supplied metric. + result = await evaluate_with_rai_service_multimodal( + messages=messages, + metric_name=self._metric, + project_scope=self._azure_ai_project, + credential=self._credential, + ) + return result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py new file mode 100644 index 000000000000..b1a1ae6898b8 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_hate_unfairness.py @@ -0,0 +1,96 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. 
All rights reserved. +# --------------------------------------------------------- +from promptflow._utils.async_utils import async_run_allowing_running_loop +from azure.ai.evaluation._common._experimental import experimental +from azure.ai.evaluation._common.constants import EvaluationMetrics +from ._content_safety_multimodal_base import ContentSafetyMultimodalEvaluatorBase + + +@experimental +class HateUnfairnessMultimodalEvaluator: + """ + Initialize a hate-unfairness multimodal evaluator for hate unfairness score. + + :param credential: The credential for connecting to the Azure AI project. This is required. + :type credential: ~azure.core.credentials.TokenCredential + :param azure_ai_project: The scope of the Azure AI project, + which includes subscription ID, resource group name, and project name. + :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject + + **Usage Example** + + .. code-block:: python + + azure_ai_project = { + "subscription_id": "", + "resource_group_name": "", + "project_name": "", + } + eval_fn = HateUnfairnessMultimodalEvaluator(credential, azure_ai_project) + result = eval_fn( + conversation={ + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "" + } + } + ] + }, + { + "role": "assistant", + "content": "This picture shows an astronaut standing in the desert." + } + ] + } + ) + + **Output Format** + + .. code-block:: json + + { + "hate_unfairness": "High", + "hate_unfairness_score": 6.5, + "hate_unfairness_reason": "Some reason" + } + + """ + + def __init__(self, credential, azure_ai_project): + self._async_evaluator = _AsyncHateUnfairnessMultimodalEvaluator(credential, azure_ai_project) + + def __call__(self, *, conversation, **kwargs): + """ + Evaluates hate unfairness content. + :keyword conversation: The conversation contains list of messages to be evaluated. + Each message should have "role" and "content" keys. 
+ :paramtype conversation: ~azure.ai.evaluation.Conversation + :return: The hate unfairness score. + :rtype: Dict[str, Union[float, str]] + """ + return async_run_allowing_running_loop(self._async_evaluator, conversation=conversation, **kwargs) + + def _to_async(self): + return self._async_evaluator + + +class _AsyncHateUnfairnessMultimodalEvaluator(ContentSafetyMultimodalEvaluatorBase): + def __init__(self, credential, azure_ai_project): + super().__init__( + metric=EvaluationMetrics.HATE_FAIRNESS, + credential=credential, + azure_ai_project=azure_ai_project, + ) + + async def __call__(self, *, conversation, **kwargs): + return await super().__call__(conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py new file mode 100644 index 000000000000..6ba03f54dc2a --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_protected_material.py @@ -0,0 +1,120 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +from promptflow._utils.async_utils import async_run_allowing_running_loop +from azure.ai.evaluation._common._experimental import experimental +from azure.ai.evaluation._common.constants import EvaluationMetrics +from azure.ai.evaluation._common.utils import validate_conversation +from azure.ai.evaluation._common.rai_service import evaluate_with_rai_service_multimodal + + +@experimental +class ProtectedMaterialMultimodalEvaluator: + """ + Initialize a protected materials evaluator to detect whether protected material + is present in multimodal messages. The evaluator outputs a Boolean label (`True` or `False`) + indicating the presence of protected material, along with AI-generated reasoning. 
+ + :param credential: The credential for connecting to the Azure AI project. This is required. + :type credential: ~azure.core.credentials.TokenCredential + :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID, + resource group, and project name. + :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject + + :return: A dictionary containing the evaluation result label and reasoning. + :rtype: Dict[str, str] + + **Usage Example** + + .. code-block:: python + + azure_ai_project = { + "subscription_id": "", + "resource_group_name": "", + "project_name": "", + } + eval_fn = ProtectedMaterialMultimodalEvaluator(credential, azure_ai_project) + result = eval_fn( + conversation={ + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "" + } + } + ] + }, + { + "role": "assistant", + "content": "This picture shows an astronaut standing in the desert." + } + ] + } + ) + + **Output Format** + + .. code-block:: json + + { + "protected_material_label": "False", + "protected_material_reason": "This query does not contain any protected material." + } + + """ + + def __init__( + self, + credential, + azure_ai_project, + ): + self._async_evaluator = _AsyncProtectedMaterialMultimodalEvaluator(credential, azure_ai_project) + + def __call__(self, *, conversation, **kwargs): + """ + Evaluates protected materials content. + + :keyword conversation: The conversation containing the messages to be evaluated. Each message should have "role" and "content" keys. + :paramtype conversation: ~azure.ai.evaluation.Conversation + :return: A dictionary containing a boolean label and reasoning. 
+ :rtype: Dict[str, str] + """ + return async_run_allowing_running_loop(self._async_evaluator, conversation=conversation, **kwargs) + + def _to_async(self): + return self._async_evaluator + + +class _AsyncProtectedMaterialMultimodalEvaluator: + def __init__(self, credential, azure_ai_project): + self._credential = credential + self._azure_ai_project = azure_ai_project + + async def __call__(self, *, conversation, **kwargs): + """ + Evaluates content according to this evaluator's metric. + :keyword conversation: The conversation contains list of messages to be evaluated. + Each message should have "role" and "content" keys. + :paramtype conversation: ~azure.ai.evaluation.Conversation + :return: The evaluation score computation based on the Content Safety metric (self.metric). + :rtype: Any + """ + # Validate inputs + validate_conversation(conversation) + messages = conversation["messages"] + # Run score computation based on supplied metric. + result = await evaluate_with_rai_service_multimodal( + messages=messages, + metric_name=EvaluationMetrics.PROTECTED_MATERIAL, + credential=self._credential, + project_scope=self._azure_ai_project, + ) + return result diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py new file mode 100644 index 000000000000..17e902f19c2f --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_self_harm.py @@ -0,0 +1,96 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# --------------------------------------------------------- +from promptflow._utils.async_utils import async_run_allowing_running_loop +from azure.ai.evaluation._common._experimental import experimental +from azure.ai.evaluation._common.constants import EvaluationMetrics +from ._content_safety_multimodal_base import ContentSafetyMultimodalEvaluatorBase + + +@experimental +class SelfHarmMultimodalEvaluator: + """ + Initialize a self harm multimodal evaluator for self harm score. + + :param credential: The credential for connecting to the Azure AI project. This is required. + :type credential: ~azure.core.credentials.TokenCredential + :param azure_ai_project: The scope of the Azure AI project, + which includes subscription ID, resource group name, and project name. + :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject + + **Usage Example** + + .. code-block:: python + + azure_ai_project = { + "subscription_id": "", + "resource_group_name": "", + "project_name": "", + } + eval_fn = SelfHarmMultimodalEvaluator(azure_ai_project) + result = eval_fn( + { + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "" + } + } + ] + }, + { + "role": "assistant", + "content": "This picture shows an astronaut standing in the desert." + } + ] + } + ) + + **Output Format** + + .. code-block:: json + + { + "self_harm": "High", + "self_harm_score": 6.5, + "self_harm_reason": "Some reason" + } + + """ + + def __init__(self, credential, azure_ai_project): + self._async_evaluator = _AsyncSelfHarmMultimodalEvaluator(credential, azure_ai_project) + + def __call__(self, *, conversation, **kwargs): + """ + Evaluates self harm content. + :keyword conversation: The conversation contains list of messages to be evaluated. + Each message should have "role" and "content" keys. + :paramtype conversation: ~azure.ai.evaluation.Conversation + :return: The self harm score. 
+ :rtype: Dict[str, Union[float, str]] + """ + return async_run_allowing_running_loop(self._async_evaluator, conversation=conversation, **kwargs) + + def _to_async(self): + return self._async_evaluator + + +class _AsyncSelfHarmMultimodalEvaluator(ContentSafetyMultimodalEvaluatorBase): + def __init__(self, credential, azure_ai_project): + super().__init__( + metric=EvaluationMetrics.SELF_HARM, + credential=credential, + azure_ai_project=azure_ai_project, + ) + + async def __call__(self, *, conversation, **kwargs): + return await super().__call__(conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_sexual.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_sexual.py new file mode 100644 index 000000000000..8b571206776b --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_sexual.py @@ -0,0 +1,96 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +from promptflow._utils.async_utils import async_run_allowing_running_loop +from azure.ai.evaluation._common.constants import EvaluationMetrics +from azure.ai.evaluation._common._experimental import experimental +from ._content_safety_multimodal_base import ContentSafetyMultimodalEvaluatorBase + + +@experimental +class SexualMultimodalEvaluator: + """ + Initialize a sexual multimodal evaluator for sexual score. + + :param credential: The credential for connecting to the Azure AI project. This is required. + :type credential: ~azure.core.credentials.TokenCredential + :param azure_ai_project: The scope of the Azure AI project, + which includes subscription ID, resource group name, and project name. + :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject + + **Usage Example** + + .. 
code-block:: python + + azure_ai_project = { + "subscription_id": "", + "resource_group_name": "", + "project_name": "", + } + eval_fn = SexualMultimodalEvaluator(azure_ai_project) + result = eval_fn( + { + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "" + } + } + ] + }, + { + "role": "assistant", + "content": "This picture shows an astronaut standing in the desert." + } + ] + } + ) + + **Output Format** + + .. code-block:: json + + { + "sexual": "High", + "sexual_score": 6.5, + "sexual_reason": "Some reason" + } + + """ + + def __init__(self, credential, azure_ai_project): + self._async_evaluator = _AsyncSexualMultimodalEvaluator(credential, azure_ai_project) + + def __call__(self, *, conversation, **kwargs): + """ + Evaluates sexual content. + :keyword conversation: The conversation contains list of messages to be evaluated. + Each message should have "role" and "content" keys. + :paramtype conversation: ~azure.ai.evaluation.Conversation + :return: The sexual score. 
+ :rtype: Dict[str, Union[float, str]] + """ + return async_run_allowing_running_loop(self._async_evaluator, conversation=conversation, **kwargs) + + def _to_async(self): + return self._async_evaluator + + +class _AsyncSexualMultimodalEvaluator(ContentSafetyMultimodalEvaluatorBase): + def __init__(self, credential, azure_ai_project): + super().__init__( + metric=EvaluationMetrics.SEXUAL, + credential=credential, + azure_ai_project=azure_ai_project, + ) + + async def __call__(self, *, conversation, **kwargs): + return await super().__call__(conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_violence.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_violence.py new file mode 100644 index 000000000000..b86382c86817 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_multimodal/_violence.py @@ -0,0 +1,96 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +from promptflow._utils.async_utils import async_run_allowing_running_loop +from azure.ai.evaluation._common._experimental import experimental +from azure.ai.evaluation._common.constants import EvaluationMetrics +from ._content_safety_multimodal_base import ContentSafetyMultimodalEvaluatorBase + + +@experimental +class ViolenceMultimodalEvaluator: + """ + Initialize a violence multimodal evaluator for violence score. + + :param credential: The credential for connecting to the Azure AI project. This is required. + :type credential: ~azure.core.credentials.TokenCredential + :param azure_ai_project: The scope of the Azure AI project, + which includes subscription ID, resource group name, and project name. + :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject + + **Usage Example** + + .. 
code-block:: python + + azure_ai_project = { + "subscription_id": "", + "resource_group_name": "", + "project_name": "", + } + eval_fn = ViolenceMultimodalEvaluator(azure_ai_project) + result = eval_fn( + { + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What's in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": "" + } + } + ] + }, + { + "role": "assistant", + "content": "This picture shows an astronaut standing in the desert." + } + ] + } + ) + + **Output Format** + + .. code-block:: json + + { + "violence": "High", + "violence_score": 6.5, + "violence_reason": "Some reason" + } + + """ + + def __init__(self, credential, azure_ai_project): + self._async_evaluator = _AsyncViolenceMultimodalEvaluator(credential, azure_ai_project) + + def __call__(self, *, conversation, **kwargs): + """ + Evaluates violence content. + :keyword conversation: The conversation contains list of messages to be evaluated. + Each message should have "role" and "content" keys. + :paramtype conversation: ~azure.ai.evaluation.Conversation + :return: The violence score. 
+ :rtype: Dict[str, Union[float, str]] + """ + return async_run_allowing_running_loop(self._async_evaluator, conversation=conversation, **kwargs) + + def _to_async(self): + return self._async_evaluator + + +class _AsyncViolenceMultimodalEvaluator(ContentSafetyMultimodalEvaluatorBase): + def __init__(self, credential, azure_ai_project): + super().__init__( + metric=EvaluationMetrics.VIOLENCE, + credential=credential, + azure_ai_project=azure_ai_project, + ) + + async def __call__(self, *, conversation, **kwargs): + return await super().__call__(conversation=conversation, **kwargs) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py index 92ae2a3e98c0..9c9351037da0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_protected_material/_protected_material.py @@ -15,15 +15,19 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase): """ Initialize a protected material evaluator to detect whether protected material - is present in your AI system's response. Outputs True or False with AI-generated reasoning. + is present in the AI system's response. The evaluator outputs a Boolean label (`True` or `False`) + indicating the presence of protected material, along with AI-generated reasoning. - :param credential: The credential for connecting to Azure AI project. Required + :param credential: The credential required for connecting to the Azure AI project. :type credential: ~azure.core.credentials.TokenCredential - :param azure_ai_project: The scope of the Azure AI project. - It contains subscription id, resource group, and project name. 
+ :param azure_ai_project: The scope of the Azure AI project, containing the subscription ID, + resource group, and project name. :type azure_ai_project: ~azure.ai.evaluation.AzureAIProject - **Usage** + :return: A dictionary with a label indicating the presence of protected material and the reasoning. + :rtype: Dict[str, Union[bool, str]] + + **Usage Example** .. code-block:: python @@ -35,14 +39,15 @@ class ProtectedMaterialEvaluator(RaiServiceEvaluatorBase): eval_fn = ProtectedMaterialEvaluator(azure_ai_project) result = eval_fn(query="What is the capital of France?", response="Paris.") - **Output format** + **Output Format** - .. code-block:: python + .. code-block:: json { - "protected_material_label": False, + "protected_material_label": false, "protected_material_reason": "This query does not contain any protected material." } + """ @override diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py index 9a7106af84ac..191703fb5715 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_exceptions.py @@ -61,6 +61,7 @@ class ErrorTarget(Enum): RAI_CLIENT = "RAIClient" COHERENCE_EVALUATOR = "CoherenceEvaluator" CONTENT_SAFETY_CHAT_EVALUATOR = "ContentSafetyEvaluator" + CONTENT_SAFETY_MULTIMODAL_EVALUATOR = "ContentSafetyMultimodalEvaluator" ECI_EVALUATOR = "ECIEvaluator" F1_EVALUATOR = "F1Evaluator" GROUNDEDNESS_EVALUATOR = "GroundednessEvaluator" diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py index 1c7f1e658143..6bd4a00cfb80 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_model_configurations.py @@ -62,7 +62,7 @@ class Message(TypedDict): class 
Conversation(TypedDict): - messages: List[Message] + messages: Union[List[Message], List[Dict]] context: NotRequired[Dict[str, Any]] diff --git a/sdk/evaluation/azure-ai-evaluation/setup.py b/sdk/evaluation/azure-ai-evaluation/setup.py index 05b3b4774ec4..83770907cedf 100644 --- a/sdk/evaluation/azure-ai-evaluation/setup.py +++ b/sdk/evaluation/azure-ai-evaluation/setup.py @@ -76,6 +76,7 @@ extras_require={ "remote": [ "promptflow-azure<2.0.0,>=1.15.0", + "azure-ai-inference>=1.0.0b4", ], }, project_urls={ diff --git a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py index 41f02dd0a3e3..5a44f2f2abb0 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/conftest.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/conftest.py @@ -40,7 +40,7 @@ RECORDINGS_TEST_CONFIGS_ROOT = Path(PROMPTFLOW_ROOT / "azure-ai-evaluation/tests/test_configs").resolve() -class SanitizedValues(str, Enum): +class SanitizedValues: SUBSCRIPTION_ID = "00000000-0000-0000-0000-000000000000" RESOURCE_GROUP_NAME = "00000" WORKSPACE_NAME = "00000" @@ -82,7 +82,7 @@ def azureopenai_connection_sanitizer(): def azure_workspace_triad_sanitizer(): """Sanitize subscription, resource group, and workspace.""" add_general_regex_sanitizer( - regex=r"/subscriptions/([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})", + regex=r"/subscriptions/([-\w\._\(\)]+)", value=mock_project_scope["subscription_id"], group_for_replace="1", ) @@ -461,7 +461,6 @@ def user_object_id() -> str: if not AZURE_INSTALLED: return "" if not is_live(): - return SanitizedValues.USER_OBJECT_ID credential = get_cred() access_token = credential.get_token("https://management.azure.com/.default") @@ -474,7 +473,6 @@ def tenant_id() -> str: if not AZURE_INSTALLED: return "" if not is_live(): - return SanitizedValues.TENANT_ID credential = get_cred() access_token = credential.get_token("https://management.azure.com/.default") diff --git 
a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/dataset_messages_b64_images.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/dataset_messages_b64_images.jsonl new file mode 100644 index 000000000000..b905702a7aa1 --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/dataset_messages_b64_images.jsonl @@ -0,0 +1 @@ +{"conversation":{"messages": [{"role": "system", "content": [{"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."}]}, {"role": "user", "content": [{"type": "text", "text": "Can you describe this image?"}, {"type": "image_url", "image_url": {"url": ""}}]}]}} \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/dataset_messages_image_urls.jsonl b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/dataset_messages_image_urls.jsonl new file mode 100644 index 000000000000..2adfa63156dc --- /dev/null +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/dataset_messages_image_urls.jsonl @@ -0,0 +1,2 @@ +{"conversation":{"messages":[{"role":"system","content":[{"type":"text","text":"This is a nature boardwalk at the University of Wisconsin-Madison."}]},{"role":"user","content":[{"type":"text","text":"Can you describe this image?"},{"type":"image_url","image_url":{"url":"https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg"}}]}]}} +{"conversation":{"messages":[{"role":"system","content":[{"type":"text","text":"This is a nature boardwalk at the University of Wisconsin-Madison."}]},{"role":"user","content":[{"type":"text","text":"Can you describe this image?"},{"type":"image_url","image_url":{"url":"https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg"}}]}]}} \ No newline at end of file diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/image1.jpg b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/image1.jpg new file mode 100644 index 000000000000..01245320f534 Binary 
files /dev/null and b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/data/image1.jpg differ diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/target_fn.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/target_fn.py index 550d07e9282e..b7764e7b8bfe 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/target_fn.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/target_fn.py @@ -17,3 +17,21 @@ def target_fn3(query: str) -> str: response = target_fn(query) response["query"] = f"The query is as follows: {query}" return response + + +def target_multimodal_fn1(conversation) -> str: + if conversation is not None and "messages" in conversation: + messages = conversation["messages"] + messages.append( + { + "role": "assistant", + "content": [ + { + "type": "image_url", + "image_url": {"url": "https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg"}, + } + ], + } + ) + conversation["messages"] = messages + return conversation diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py index 51ae1899c2e4..79e3f484206a 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_builtin_evaluators.py @@ -1,9 +1,20 @@ import math -import platform +import base64 +import os +import pathlib import pytest from devtools_testutils import is_live +from azure.ai.inference.models import ( + UserMessage, + SystemMessage, + AssistantMessage, + TextContentItem, + ImageContentItem, + ImageUrl, +) +from azure.ai.evaluation._model_configurations import Conversation from azure.ai.evaluation import ( BleuScoreEvaluator, CoherenceEvaluator, @@ -26,6 +37,12 @@ ViolenceEvaluator, RetrievalEvaluator, GroundednessProEvaluator, + ContentSafetyMultimodalEvaluator, + SexualMultimodalEvaluator, + HateUnfairnessMultimodalEvaluator, + 
SelfHarmMultimodalEvaluator, + ViolenceMultimodalEvaluator, + ProtectedMaterialMultimodalEvaluator, ) from azure.ai.evaluation._evaluators._eci._eci import ECIEvaluator @@ -565,3 +582,440 @@ def test_groundedness_pro_evaluator(self, project_scope, azure_cred, simple_conv assert all( convo_result["evaluation_per_turn"]["groundedness_pro_reason"] ), "groundedness_pro_reason must not be None or empty." + + def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_only(self, project_scope, azure_cred): + evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope) + conversation = { + "messages": [ + { + "role": "system", + "content": [ + {"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."} + ], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Can you describe this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.", + } + ], + }, + ] + } + score = evaluator(conversation=conversation) + assert score is not None + assert score["violence"] == "Very low" + assert score["violence_score"] < 1.0 + assert score["violence_reason"], "violence_reason must not be None or empty." + assert score["sexual"] == "Very low" + assert score["sexual_score"] < 1.0 + assert score["sexual_reason"], "sexual_reason must not be None or empty." + assert score["self_harm"] == "Very low" + assert score["self_harm_score"] < 1.0 + assert score["self_harm_reason"], "self_harm_reason must not be None or empty." 
+ assert score["hate_unfairness"] == "Very low" + assert score["hate_unfairness_score"] < 1.0 + assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty." + + def test_multimodal_evaluator_content_safety_json_image_urls_text_image_input_only_with_text_content( + self, project_scope, azure_cred + ): + evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope) + conversation = { + "messages": [ + { + "role": "system", + "content": [ + {"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."} + ], + }, + {"role": "user", "content": "What is in this picture?"}, + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.", + } + ], + }, + ] + } + score = evaluator(conversation=conversation) + + assert score is not None + assert score["violence"] == "Very low" + assert score["violence_score"] < 1.0 + assert score["violence_reason"], "violence_reason must not be None or empty." + assert score["sexual"] == "Very low" + assert score["sexual_score"] < 1.0 + assert score["sexual_reason"], "sexual_reason must not be None or empty." + assert score["self_harm"] == "Very low" + assert score["self_harm_score"] < 1.0 + assert score["self_harm_reason"], "self_harm_reason must not be None or empty." + assert score["hate_unfairness"] == "Very low" + assert score["hate_unfairness_score"] < 1.0 + assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty." 
+ + def test_multimodal_evaluator_content_safety_json_image_urls_with_image_output(self, project_scope, azure_cred): + evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope) + conversation = { + "messages": [ + { + "role": "system", + "content": [{"type": "text", "text": "You are AI assistant who can describe images."}], + }, + { + "role": "user", + "content": [{"type": "text", "text": "Create an picture of astronaut in desert."}], + }, + { + "role": "assistant", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + }, + ] + } + score = evaluator(conversation=conversation) + + assert score is not None + assert score["violence"] == "Very low" + assert score["violence_score"] < 1.0 + assert score["violence_reason"], "violence_reason must not be None or empty." + assert score["sexual"] == "Very low" + assert score["sexual_score"] < 1.0 + assert score["sexual_reason"], "sexual_reason must not be None or empty." + assert score["self_harm"] == "Very low" + assert score["self_harm_score"] < 1.0 + assert score["self_harm_reason"], "self_harm_reason must not be None or empty." + assert score["hate_unfairness"] == "Very low" + assert score["hate_unfairness_score"] < 1.0 + assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty." 
+ + def test_multimodal_evaluator_content_safety_json_b64_image(self, project_scope, azure_cred): + parent = pathlib.Path(__file__).parent.resolve() + path = os.path.join(parent, "data") + image_path = os.path.join(path, "image1.jpg") + + with pathlib.Path(image_path).open("rb") as image_file: + encoded_image = base64.b64encode(image_file.read()).decode("utf-8") + + evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope) + conversation = { + "messages": [ + { + "role": "system", + "content": [ + {"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."} + ], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}, + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.", + } + ], + }, + ] + } + score = evaluator(conversation=conversation) + + assert score is not None + assert score["violence"] == "Very low" + assert score["violence_score"] < 1.0 + assert score["violence_reason"], "violence_reason must not be None or empty." + assert score["sexual"] == "Very low" + assert score["sexual_score"] < 1.0 + assert score["sexual_reason"], "sexual_reason must not be None or empty." + assert score["self_harm"] == "Very low" + assert score["self_harm_score"] < 1.0 + assert score["self_harm_reason"], "self_harm_reason must not be None or empty." + assert score["hate_unfairness"] == "Very low" + assert score["hate_unfairness_score"] < 1.0 + assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty." 
+ + def test_multimodal_evaluator_content_safety_inference(self, project_scope, azure_cred): + evaluator = ContentSafetyMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope) + conversation = Conversation( + messages=[ + SystemMessage( + content=[TextContentItem(text="You are an AI assitance who can describe images.")], + ), + UserMessage( + content=[ + TextContentItem(text="What is in this image?"), + ImageContentItem( + image_url=ImageUrl( + url="https://cdn.britannica.com/68/178268-050-5B4E7FB6/Tom-Cruise-2013.jpg" + ), + ), + ], + ), + AssistantMessage( + content=[ + TextContentItem( + text="The image shows a man with short brown hair smiling, wearing a dark-colored shirt." + ) + ], + ), + ] + ) + score = evaluator(conversation=conversation) + + assert score is not None + assert score["violence"] == "Very low" + assert score["violence_score"] < 1.0 + assert score["violence_reason"], "violence_reason must not be None or empty." + assert score["sexual"] == "Very low" + assert score["sexual_score"] < 1.0 + assert score["sexual_reason"], "sexual_reason must not be None or empty." + assert score["self_harm"] == "Very low" + assert score["self_harm_score"] < 1.0 + assert score["self_harm_reason"], "self_harm_reason must not be None or empty." + assert score["hate_unfairness"] == "Very low" + assert score["hate_unfairness_score"] < 1.0 + assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty." 
+ + def test_multimodal_evaluator_violence_json(self, project_scope, azure_cred): + evaluator = ViolenceMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope) + conversation = { + "messages": [ + { + "role": "system", + "content": [ + {"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."} + ], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Can you describe this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.", + } + ], + }, + ] + } + score = evaluator(conversation=conversation) + + assert score is not None + assert score["violence"] == "Very low" + assert score["violence_score"] < 1.0 + assert score["violence_reason"], "violence_reason must not be None or empty." 
+ + def test_multimodal_evaluator_sexual_json(self, project_scope, azure_cred): + evaluator = SexualMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope) + conversation = { + "messages": [ + { + "role": "system", + "content": [ + {"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."} + ], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Can you describe this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.", + } + ], + }, + ] + } + score = evaluator(conversation=conversation) + + assert score is not None + assert score["sexual"] == "Very low" + assert score["sexual_score"] < 1.0 + assert score["sexual_reason"], "sexual_reason must not be None or empty." 
+ + def test_multimodal_evaluator_hate_unfairness_json(self, project_scope, azure_cred): + evaluator = HateUnfairnessMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope) + conversation = { + "messages": [ + { + "role": "system", + "content": [ + {"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."} + ], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Can you describe this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.", + } + ], + }, + ] + } + score = evaluator(conversation=conversation) + + assert score is not None + assert score["hate_unfairness"] == "Very low" + assert score["hate_unfairness_score"] < 1.0 + assert score["hate_unfairness_reason"], "hate_unfairness_reason must not be None or empty." 
+ + def test_multimodal_evaluator_self_harm_json(self, project_scope, azure_cred): + evaluator = SelfHarmMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope) + conversation = { + "messages": [ + { + "role": "system", + "content": [ + {"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."} + ], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Can you describe this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.", + } + ], + }, + ] + } + score = evaluator(conversation=conversation) + + assert score is not None + assert score["self_harm"] == "Very low" + assert score["self_harm_score"] < 1.0 + assert score["self_harm_reason"], "self_harm_reason must not be None or empty." 
+ + def test_multimodal_evaluator_protected_material_json(self, project_scope, azure_cred): + evaluator = ProtectedMaterialMultimodalEvaluator(credential=azure_cred, azure_ai_project=project_scope) + conversation = { + "messages": [ + { + "role": "system", + "content": [ + {"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."} + ], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Can you describe this image?"}, + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + }, + }, + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The image shows a man with short brown hair smiling, wearing a dark-colored shirt.", + } + ], + }, + ] + } + + score = evaluator(conversation=conversation) + + assert score is not None + # assert not result["artwork_label"] + # assert "artwork was not found" in result["artwork_reason"] + # assert not result["protected_material_label"] + # assert "material was not found" in result["protected_material_reason"] + # assert not result["protected_material_label"] + # assert "material was not found" in result["protected_material_reason"] diff --git a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py index 948660387773..b70b2bf31dde 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py @@ -2,15 +2,17 @@ import math import os import pathlib -import time - import pandas as pd import pytest import requests from ci_tools.variables import in_ci +import uuid +import tempfile from azure.ai.evaluation import ( ContentSafetyEvaluator, + ContentSafetyMultimodalEvaluator, + SexualMultimodalEvaluator, F1ScoreEvaluator, 
FluencyEvaluator, GroundednessEvaluator, @@ -18,6 +20,7 @@ evaluate, ) from azure.ai.evaluation._common.math import list_mean_nan_safe +import azure.ai.evaluation._evaluate._utils as ev_utils @pytest.fixture @@ -32,6 +35,18 @@ def data_convo_file(): return os.path.join(data_path, "evaluate_test_data_conversation.jsonl") +@pytest.fixture +def multimodal_file_with_imageurls(): + data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data") + return os.path.join(data_path, "dataset_messages_image_urls.jsonl") + + +@pytest.fixture +def multimodal_file_with_b64_images(): + data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data") + return os.path.join(data_path, "dataset_messages_b64_images.jsonl") + + @pytest.fixture def questions_file(): data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data") @@ -171,9 +186,7 @@ def test_evaluate_with_relative_data_path(self, model_config): finally: os.chdir(original_working_dir) - @pytest.mark.azuretest - @pytest.mark.skip(reason="Temporary skip to merge 37201, will re-enable in subsequent pr") - def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, azure_cred): + def test_evaluate_with_content_safety_evaluator(self, project_scope, azure_cred, data_file): input_data = pd.read_json(data_file, lines=True) # CS evaluator tries to store the credential, which breaks multiprocessing at @@ -212,13 +225,159 @@ def test_evaluate_with_content_safety_evaluator(self, project_scope, data_file, assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1 assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1 + def test_saving_b64_images(self, multimodal_file_with_b64_images): + instance_results = pd.read_json(multimodal_file_with_b64_images, lines=True) + with tempfile.TemporaryDirectory() as tmpdir: + for key, item in instance_results["conversation"].items(): + ev_utils._store_multimodal_content(item["messages"], tmpdir) + image_folder = 
os.path.join(tmpdir, "images") + files = [file for file in os.listdir(image_folder)] + assert isinstance(files, list), "The result should be a list" + assert 1 == len(files), "file1.txt should be present in the folder" + + def test_evaluate_with_content_safety_multimodal_evaluator( + self, project_scope, azure_cred, multimodal_file_with_imageurls + ): + os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false" + input_data = pd.read_json(multimodal_file_with_imageurls, lines=True) + content_safety_eval = ContentSafetyMultimodalEvaluator( + azure_ai_project=project_scope, credential=azure_cred, parallel=False + ) + result = evaluate( + evaluation_name=f"test-mm-eval-dataset-img-url-{str(uuid.uuid4())}", + azure_ai_project=project_scope, + data=multimodal_file_with_imageurls, + evaluators={"content_safety": content_safety_eval}, + evaluator_config={ + "content_safety": {"conversation": "${data.conversation}"}, + }, + ) + + row_result_df = pd.DataFrame(result["rows"]) + metrics = result["metrics"] + # validate the results + assert result is not None + assert result["rows"] is not None + assert row_result_df.shape[0] == len(input_data) + + assert "outputs.content_safety.sexual" in row_result_df.columns.to_list() + assert "outputs.content_safety.violence" in row_result_df.columns.to_list() + assert "outputs.content_safety.self_harm" in row_result_df.columns.to_list() + assert "outputs.content_safety.hate_unfairness" in row_result_df.columns.to_list() + + assert "content_safety.sexual_defect_rate" in metrics.keys() + assert "content_safety.violence_defect_rate" in metrics.keys() + assert "content_safety.self_harm_defect_rate" in metrics.keys() + assert "content_safety.hate_unfairness_defect_rate" in metrics.keys() + + assert 0 <= metrics.get("content_safety.sexual_defect_rate") <= 1 + assert 0 <= metrics.get("content_safety.violence_defect_rate") <= 1 + assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1 + assert 0 <= 
metrics.get("content_safety.hate_unfairness_defect_rate") <= 1 + + def test_evaluate_with_content_safety_multimodal_evaluator_with_target( + self, project_scope, azure_cred, multimodal_file_with_imageurls + ): + os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false" + from .target_fn import target_multimodal_fn1 + + input_data = pd.read_json(multimodal_file_with_imageurls, lines=True) + content_safety_eval = ContentSafetyMultimodalEvaluator( + azure_ai_project=project_scope, credential=azure_cred, parallel=False + ) + result = evaluate( + evaluation_name=f"test-mm-eval-dataset-img-url-target-{str(uuid.uuid4())}", + azure_ai_project=project_scope, + data=multimodal_file_with_imageurls, + target=target_multimodal_fn1, + evaluators={"content_safety": content_safety_eval}, + evaluator_config={ + "content_safety": {"conversation": "${data.conversation}"}, + }, + ) + + row_result_df = pd.DataFrame(result["rows"]) + metrics = result["metrics"] + # validate the results + assert result is not None + assert result["rows"] is not None + assert row_result_df.shape[0] == len(input_data) + + assert "outputs.content_safety.sexual" in row_result_df.columns.to_list() + assert "outputs.content_safety.violence" in row_result_df.columns.to_list() + assert "outputs.content_safety.self_harm" in row_result_df.columns.to_list() + assert "outputs.content_safety.hate_unfairness" in row_result_df.columns.to_list() + + assert "content_safety.sexual_defect_rate" in metrics.keys() + assert "content_safety.violence_defect_rate" in metrics.keys() + assert "content_safety.self_harm_defect_rate" in metrics.keys() + assert "content_safety.hate_unfairness_defect_rate" in metrics.keys() + + assert 0 <= metrics.get("content_safety.sexual_defect_rate") <= 1 + assert 0 <= metrics.get("content_safety.violence_defect_rate") <= 1 + assert 0 <= metrics.get("content_safety.self_harm_defect_rate") <= 1 + assert 0 <= metrics.get("content_safety.hate_unfairness_defect_rate") <= 1 + + def 
test_evaluate_with_sexual_multimodal_evaluator(self, project_scope, azure_cred, multimodal_file_with_imageurls): + os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false" + input_data = pd.read_json(multimodal_file_with_imageurls, lines=True) + eval = SexualMultimodalEvaluator(azure_ai_project=project_scope, credential=azure_cred) + + result = evaluate( + evaluation_name=f"test-mm-sexual-eval-dataset-img-url-{str(uuid.uuid4())}", + azure_ai_project=project_scope, + data=multimodal_file_with_imageurls, + evaluators={"sexual": eval}, + evaluator_config={ + "sexual": {"conversation": "${data.conversation}"}, + }, + ) + + row_result_df = pd.DataFrame(result["rows"]) + metrics = result["metrics"] + # validate the results + assert result is not None + assert result["rows"] is not None + assert row_result_df.shape[0] == len(input_data) + + assert "outputs.sexual.sexual" in row_result_df.columns.to_list() + assert "sexual.sexual_defect_rate" in metrics.keys() + assert 0 <= metrics.get("sexual.sexual_defect_rate") <= 1 + + def test_evaluate_with_sexual_multimodal_evaluator_b64_images( + self, project_scope, azure_cred, multimodal_file_with_b64_images + ): + os.environ["PF_EVALS_BATCH_USE_ASYNC"] = "false" + input_data = pd.read_json(multimodal_file_with_b64_images, lines=True) + eval = SexualMultimodalEvaluator(azure_ai_project=project_scope, credential=azure_cred) + result = evaluate( + evaluation_name=f"test-mm-sexual-eval-dataset-img-b64-{str(uuid.uuid4())}", + azure_ai_project=project_scope, + data=multimodal_file_with_b64_images, + evaluators={"sexual": eval}, + evaluator_config={ + "sexual": {"conversation": "${data.conversation}"}, + }, + ) + + row_result_df = pd.DataFrame(result["rows"]) + metrics = result["metrics"] + # validate the results + assert result is not None + assert result["rows"] is not None + assert row_result_df.shape[0] == len(input_data) + + assert "outputs.sexual.sexual" in row_result_df.columns.to_list() + assert "sexual.sexual_defect_rate" in 
metrics.keys() + assert 0 <= metrics.get("sexual.sexual_defect_rate") <= 1 + def test_evaluate_with_groundedness_pro_evaluator(self, project_scope, data_convo_file, azure_cred): # CS evaluator tries to store the credential, which breaks multiprocessing at # pickling stage. So we pass None for credential and let child evals # generate a default credential at runtime. # Internal Parallelism is also disabled to avoid faulty recordings. - gp_eval = GroundednessProEvaluator(azure_ai_project=project_scope, credential=azure_cred, parallel=False) + gp_eval = GroundednessProEvaluator(azure_ai_project=project_scope, credential=azure_cred) convo_input_data = pd.read_json(data_convo_file, lines=True) # run the evaluation diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py index ca2904ac4f9d..7a4d4f1efa91 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_content_safety_rai_script.py @@ -258,7 +258,7 @@ def test_parse_response(self): ) assert math.isnan(result[EvaluationMetrics.HATE_UNFAIRNESS]) assert math.isnan(result[EvaluationMetrics.HATE_UNFAIRNESS + "_score"]) - assert result[EvaluationMetrics.HATE_UNFAIRNESS + "_reason"] == "" + assert math.isnan(result[EvaluationMetrics.HATE_UNFAIRNESS + "_reason"]) metric_name = EvaluationMetrics.VIOLENCE response_value = { diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py index 7426c0836a7a..d673d08d7491 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_utils.py @@ -1,4 +1,8 @@ import pytest +import os +import pathlib +import base64 +import json from azure.ai.evaluation._common.utils import nltk_tokenize @@ -18,3 
+22,35 @@ def test_nltk_tokenize(self): tokens = nltk_tokenize(text) assert tokens == ["The", "capital", "of", "China", "is", "北京", "."] + + def convert_json_list_to_jsonl(self, project_scope, azure_cred): + + parent = pathlib.Path(__file__).parent.resolve() + path = os.path.join(parent, "data") + image_path = os.path.join(path, "image1.jpg") + + with pathlib.Path(image_path).open("rb") as image_file: + encoded_image = base64.b64encode(image_file.read()).decode("utf-8") + + conversation = [ + { + "role": "system", + "content": [ + {"type": "text", "text": "This is a nature boardwalk at the University of Wisconsin-Madison."} + ], + }, + { + "role": "user", + "content": [ + {"type": "text", "text": "Can you describe this image?"}, + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}, + ], + }, + ] + + messages = [{"messages": conversation}] + datafile_jsonl_path = os.path.join(path, "datafile.jsonl") + with open(datafile_jsonl_path, "w") as outfile: + for json_obj in messages: + json_line = json.dumps(json_obj) + outfile.write(json_line + "\n")